From 80f8ff530817f6fb66b09d133000d56f677fd449 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Sun, 11 Jun 2023 23:32:26 +0200
Subject: [PATCH 01/42] Add stream manager per gpu and set device id for mallocs

---
 CMakeLists.txt                  |  29 +++++-
 include/aggregation_manager.hpp |  94 ++++++++++--------
 include/buffer_manager.hpp      |  22 ++---
 include/cuda_buffer_util.hpp    |  43 ++-------
 include/detail/config.hpp       |  64 +++++++++++++
 include/hip_buffer_util.hpp     |  23 +----
 include/stream_manager.hpp      | 163 ++++++++++++++++----------------
 7 files changed, 246 insertions(+), 192 deletions(-)
 create mode 100644 include/detail/config.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12674d2e..5801e499 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,15 +23,17 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}
 option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF)
 option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF)
 option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF)
+set(CPPUDDLE_WITH_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.")
 # HPX-related options
 option(CPPUDDLE_WITH_HPX "Enable basic HPX integration and examples" OFF)
 option(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS "Enable HPX-aware allocators for even better HPX integration" ON)
 set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL "Use HPX spinlock mutex instead of std::mutex")
+set(CPPUDDLE_WITH_MAX_NUMBER_WORKERS "128" CACHE STRING "Max number of worker threads supported. Should match the intended number of HPX workers or be 1 in non-HPX builds.")
 # Test-related options
 option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF)
 option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF)
-set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
+set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "10000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
 option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF)
 option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF)
 # Tooling options
@@ -61,6 +63,19 @@ if(CPPUDDLE_WITH_HPX)
   endif()
 endif()
 
+if(CPPUDDLE_WITH_NUMBER_GPUS GREATER 1)
+  if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
+    message(FATAL_ERROR " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is required for Multi-GPU builds!")
+  endif()
+endif()
+
+if(CPPUDDLE_WITH_MAX_NUMBER_WORKERS GREATER 1)
+  if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
+    message(FATAL_ERROR " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is required for Multi-Worker builds! \
+      Either turn it on or configure with CPPUDDLE_WITH_MAX_NUMBER_WORKERS=1 !")
+  endif()
+endif()
+
 # HPX-aware allocators require HPX-Support.
Warn if HPX support is disabled as we fallback on non-aware # allocators if(NOT CPPUDDLE_WITH_HPX) @@ -80,7 +95,7 @@ if (CPPUDDLE_WITH_KOKKOS) find_package(Kokkos 3.0.0 REQUIRED) find_package(HPXKokkos REQUIRED) - # Check that everything required is actyivated + # Check that everything required is activated if (NOT CPPUDDLE_WITH_HPX) message(FATAL_ERROR " KOKKOS support requires HPX flag to be turned on") endif() @@ -149,6 +164,11 @@ if (CPPUDDLE_WITH_HPX) if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) message(INFO " Compiling with HPX-aware allocators!") target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}") + else() + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") endif() endif() if (CPPUDDLE_WITH_COUNTERS) @@ -164,6 +184,11 @@ if (CPPUDDLE_WITH_HPX) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX") if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}") + else() + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") endif() endif() if (CPPUDDLE_WITH_COUNTERS) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index fe1de846..47803043 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,3 +1,8 @@ +// Copyright (c) 2022-2023 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + #ifndef WORK_AGGREGATION_MANAGER #define WORK_AGGREGATION_MANAGER @@ -27,7 +32,6 @@ #include <hpx/include/iostreams.hpp> #include <hpx/include/lcos.hpp> #include <hpx/lcos/promise.hpp> -//#include <hpx/synchronization/mutex.hpp> // obsolete #include <hpx/mutex.hpp> #if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) @@ -41,12 +45,7 @@ #include "../include/buffer_manager.hpp" #include "../include/stream_manager.hpp" - -#if defined(CPPUDDLE_HAVE_HPX_MUTEX) -using aggregation_mutex_t = hpx::spinlock; -#else -using aggregation_mutex_t = std::mutex; -#endif +#include "../include/detail/config.hpp" //=============================================================================== //=============================================================================== @@ -313,7 +312,8 @@ template <typename Executor> class aggregated_function_call { potential_async_promises[local_counter].get_future(); if (local_counter == number_slices - 1) { /* slices_ready_promise.set_value(); */ - auto fut = exec_async_wrapper<Executor, F, Ts...>(underlying_executor, std::forward<F>(f), std::forward<Ts>(ts)...); + auto fut = exec_async_wrapper<Executor, F, Ts...>( + underlying_executor, std::forward<F>(f), std::forward<Ts>(ts)...); fut.then([this](auto &&fut) { for (auto &promise : potential_async_promises) { promise.set_value(); @@ -808,8 +808,10 @@ template <typename Executor> class Aggregated_Executor { if (local_slice_id == 1) { // Renew promise that all slices will be ready as the primary launch criteria... hpx::lcos::shared_future<void> fut; - if (mode == Aggregated_Executor_Modes::EAGER || mode == Aggregated_Executor_Modes::ENDLESS) { - // Fallback launch condidtion: Launch as soon as the underlying stream is ready + if (mode == Aggregated_Executor_Modes::EAGER || + mode == Aggregated_Executor_Modes::ENDLESS) { + // Fallback launch condidtion: Launch as soon as the underlying stream + // is ready /* auto slices_full_fut = slices_full_promise.get_future(); */ auto exec_fut = executor.get_future(); /* fut = hpx::when_any(exec_fut, slices_full_fut); */ @@ -835,15 +837,16 @@ template <typename Executor> class Aggregated_Executor { } if (local_slice_id >= max_slices && mode != Aggregated_Executor_Modes::ENDLESS) { - slices_exhausted = true; // prevents any more threads from entering before the continuation is launched - /* launched_slices = current_slices; */ - /* size_t id = 0; */ - /* for (auto &slice_promise : executor_slices) { */ - /* slice_promise.set_value( */ - /* Executor_Slice{*this, id, launched_slices}); */ - /* id++; */ - /* } */ - /* executor_slices.clear(); */ + slices_exhausted = true; // prevents any more threads from entering + // before the continuation is launched + /* launched_slices = current_slices; */ + /* size_t id = 0; */ + /* for (auto &slice_promise : executor_slices) { */ + /* slice_promise.set_value( */ + /* Executor_Slice{*this, id, launched_slices}); */ + /* id++; */ + /* } */ + /* executor_slices.clear(); */ if (mode == Aggregated_Executor_Modes::STRICT ) { slices_full_promise.set_value(); // Trigger slices launch condition continuation } @@ -1005,38 +1008,40 @@ class aggregation_pool { template <typename... 
Ts> static void init(size_t number_of_executors, size_t slices_per_executor, Aggregated_Executor_Modes mode) { - std::lock_guard<aggregation_mutex_t> guard(instance.pool_mutex); - assert(instance.aggregation_executor_pool.empty()); + const size_t gpu_id = get_device_id(); + std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { - instance.aggregation_executor_pool.emplace_back(slices_per_executor, + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, mode); } - instance.slices_per_executor = slices_per_executor; - instance.mode = mode; + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; } /// Will always return a valid executor slice static decltype(auto) request_executor_slice(void) { - std::lock_guard<aggregation_mutex_t> guard(instance.pool_mutex); - assert(!instance.aggregation_executor_pool.empty()); + const size_t gpu_id = get_device_id(); + std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); std::optional<hpx::lcos::future< typename Aggregated_Executor<Interface>::Executor_Slice>> ret; - size_t local_id = (instance.current_interface) % - instance.aggregation_executor_pool.size(); - ret = instance.aggregation_executor_pool[local_id].request_executor_slice(); + size_t local_id = (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); // Expected case: current aggregation executor is free if (ret.has_value()) { return ret; } // current interface is bad -> find free one size_t abort_counter = 0; - const size_t abort_number = instance.aggregation_executor_pool.size() + 1; + const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; do { - local_id = (++(instance.current_interface)) % // increment interface - instance.aggregation_executor_pool.size(); + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); ret = - instance.aggregation_executor_pool[local_id].request_executor_slice(); + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); if (ret.has_value()) { return ret; } @@ -1044,12 +1049,15 @@ class aggregation_pool { } while (abort_counter <= abort_number); // Everything's busy -> create new aggregation executor (growing pool) OR // return empty optional - if (instance.growing_pool) { - instance.aggregation_executor_pool.emplace_back( - instance.slices_per_executor, instance.mode); - instance.current_interface = instance.aggregation_executor_pool.size() - 1; - assert(instance.aggregation_executor_pool.size() < 20480); - ret = instance.aggregation_executor_pool[instance.current_interface].request_executor_slice(); + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); assert(ret.has_value()); // fresh executor -- should always have slices 
// available } @@ -1066,9 +1074,13 @@ class aggregation_pool { private: /// Required for dealing with adding elements to the deque of /// aggregated_executors - static inline aggregation_mutex_t pool_mutex; + aggregation_mutex_t pool_mutex; /// Global access instance - static inline aggregation_pool instance{}; + static std::unique_ptr<aggregation_pool[]>& instance(void) { + static std::unique_ptr<aggregation_pool[]> pool_instances{ + new aggregation_pool[max_number_gpus]}; + return pool_instances; + } aggregation_pool() = default; public: diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ddbffb70..8907a0e5 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2023 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -23,7 +23,7 @@ #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS #pragma message \ "Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ -For better performance configure CPPuddle with the cmake option CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON !" +For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!" #else // include runtime to get HPX thread IDs required for the HPX-aware allocators #include <hpx/include/runtime.hpp> @@ -39,16 +39,11 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_WITH_HP #include <boost/core/demangle.hpp> #endif +#include "../include/detail/config.hpp" + namespace recycler { -constexpr size_t number_instances = 128; namespace detail { -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) -using mutex_t = hpx::spinlock; -#else -using mutex_t = std::mutex; -#endif - class buffer_recycler { // Public interface public: @@ -57,7 +52,7 @@ class buffer_recycler { // Warn about suboptimal performance without recycling #pragma message \ "Warning: Building without buffer recycling! Use only for performance testing! \ -For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF !" +For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF!" template <typename T, typename Host_Allocator> static T *get(size_t number_elements, bool manage_content_lifetime = false, @@ -258,7 +253,9 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV return buffer; } catch (std::bad_alloc &e) { // not enough memory left! Cleanup and attempt again: - std::cerr << "Not enough memory left. Cleaning up unused buffers now..." << std::endl; + std::cerr + << "Not enough memory left. Cleaning up unused buffers now..." + << std::endl; buffer_recycler::clean_unused_buffers(); std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; @@ -370,7 +367,6 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV buffer_manager& operator=(buffer_manager<T, Host_Allocator> &&other) = delete; static std::unique_ptr<buffer_manager[]>& instance(void) { - /* static std::array<buffer_manager, number_instances> instances{{}}; */ static std::unique_ptr<buffer_manager[]> instances{ new buffer_manager[number_instances]}; return instances; @@ -603,7 +599,7 @@ struct aggressive_recycle_allocator { // Warn about suboptimal performance without recycling #pragma message \ "Warning: Building without content reusage for aggressive allocators! 
\ -For better performance configure with the cmake option CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF !" +For better performance configure with CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF !" template <typename... Args> inline void construct(T *p, Args... args) noexcept { ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...); diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index d2d0f596..5d0cb156 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2023 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -7,6 +7,7 @@ #define CUDA_BUFFER_UTIL_HPP #include "buffer_manager.hpp" +#include "detail/config.hpp" #include <cuda_runtime.h> #include <stdexcept> @@ -22,6 +23,7 @@ template <class T> struct cuda_pinned_allocator { template <class U> explicit cuda_pinned_allocator(cuda_pinned_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { + cudaSetDevice(get_device_id()); T *data; cudaError_t error = cudaMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T)); @@ -62,6 +64,7 @@ template <class T> struct cuda_device_allocator { template <class U> explicit cuda_device_allocator(cuda_device_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { + cudaSetDevice(get_device_id()); T *data; cudaError_t error = cudaMalloc(&data, n * sizeof(T)); if (error != cudaSuccess) { @@ -114,29 +117,14 @@ struct cuda_device_buffer { device_side_buffer = recycle_allocator_cuda_device<T>{}.allocate(number_of_elements); } + // TODO deprecate and remove gpu_id explicit cuda_device_buffer(size_t number_of_elements, size_t gpu_id) : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - cudaSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice - // Allows for testing without any changes to other projects - assert(gpu_id == 0); -#endif + assert(gpu_id == 0); device_side_buffer = recycle_allocator_cuda_device<T>{}.allocate(number_of_elements); } ~cuda_device_buffer() { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - if (set_id) - cudaSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice - // Allows for testing without any changes to other projects - assert(gpu_id == 0); -#endif recycle_allocator_cuda_device<T>{}.deallocate(device_side_buffer, number_of_elements); } @@ -160,29 +148,14 @@ struct cuda_aggregated_device_buffer { device_side_buffer = recycle_allocator_cuda_device<T>{}.allocate(number_of_elements); } + // TODO deprecate and remove gpu_id explicit cuda_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc) : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - cudaSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. 
If it's worth it, add separate method without cudaSetDevice - // Allows for testing without any changes to other projects - assert(gpu_id == 0); -#endif + assert(gpu_id == 0); device_side_buffer = alloc.allocate(number_of_elements); } ~cuda_aggregated_device_buffer() { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - if (set_id) - cudaSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice - // Allows for testing without any changes to other projects - assert(gpu_id == 0); -#endif alloc.deallocate(device_side_buffer, number_of_elements); } diff --git a/include/detail/config.hpp b/include/detail/config.hpp new file mode 100644 index 00000000..fd15cfc2 --- /dev/null +++ b/include/detail/config.hpp @@ -0,0 +1,64 @@ +// Copyright (c) 2023-2023 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CPPUDDLE_CONFIG_HPP +#define CPPUDDLE_CONFIG_HPP + +// Mutex configuration +// +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) +#include <hpx/mutex.hpp> +using mutex_t = hpx::spinlock; +using aggregation_mutex_t = hpx::spinlock; +#else +#include <mutex> +using mutex_t = std::mutex; +using aggregation_mutex_t = std::mutex; +#endif + +// HPX-aware configuration +// +#ifdef CPPUDDLE_HAVE_HPX +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS +#pragma message \ +"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ +For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!" +#else +// include runtime to get HPX thread IDs required for the HPX-aware allocators +#include <hpx/include/runtime.hpp> +#endif +#endif + +// Recycling configuration +// TODO Add warnings here + +// Aggressive recycling configuration +// TODO Add warning here + +// Aggregation Debug configuration +// TODO Add warning here + +// Thread and MultiGPU configuration +// +constexpr size_t number_instances = CPPUDDLE_MAX_NUMBER_WORKERS; +constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS; +#ifndef CPPUDDLE_HAVE_HPX +static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu"); +#endif +static_assert(number_instances >= max_number_gpus); +static_assert(max_number_gpus > 0); +static_assert(number_instances > 0); +constexpr size_t instances_per_gpu = number_instances / max_number_gpus; + +/// Uses HPX thread information to determine which GPU should be used +size_t get_device_id(void) { +#if defined(CPPUDDLE_HAVE_HPX) + return hpx::get_worker_thread_num() / instances_per_gpu; +#else + return 0; +#endif +} + +#endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 5a4209c1..c33566e5 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -22,6 +22,7 @@ template <class T> struct hip_pinned_allocator { template <class U> explicit hip_pinned_allocator(hip_pinned_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { + hipSetDevice(get_device_id()); T *data; // hipError_t error = // hipMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T)); @@ -68,6 +69,7 @@ template <class T> struct hip_device_allocator { template <class U> explicit hip_device_allocator(hip_device_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { + hipSetDevice(get_device_id()); T *data; hipError_t error = hipMalloc(&data, n 
* sizeof(T)); if (error != hipSuccess) { @@ -123,16 +125,10 @@ struct hip_device_buffer { } explicit hip_device_buffer(size_t number_of_elements, size_t gpu_id) : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) { - - // TODO Fix Multi GPU support - // hipSetDevice(gpu_id); device_side_buffer = recycle_allocator_hip_device<T>{}.allocate(number_of_elements); } ~hip_device_buffer() { - // TODO Fix Multi GPU support - // if (set_id) - // hipSetDevice(gpu_id); recycle_allocator_hip_device<T>{}.deallocate(device_side_buffer, number_of_elements); } @@ -158,27 +154,12 @@ struct hip_aggregated_device_buffer { } explicit hip_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc) : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - hipSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. If it's worth it, add separate method without hipSetDevice - // Allows for testing without any changes to other projects assert(gpu_id == 0); -#endif device_side_buffer = alloc.allocate(number_of_elements); } ~hip_aggregated_device_buffer() { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - if (set_id) - hipSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. If it's worth it, add separate method without hipSetDevice - // Allows for testing without any changes to other projects assert(gpu_id == 0); -#endif alloc.deallocate(device_side_buffer, number_of_elements); } diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 1d48bcd6..87740d2d 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2023 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -7,28 +7,25 @@ #define STREAM_MANAGER_HPP #include <algorithm> +#include <array> #include <cassert> #include <deque> #include <iostream> #include <memory> #include <mutex> #include <queue> +#include <tuple> #include <type_traits> -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) -// For builds with The HPX mutex -#include <hpx/mutex.hpp> -#endif - -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) -using mutex_t = hpx::spinlock; -#else -using mutex_t = std::mutex; -#endif +#include "../include/detail/config.hpp" -//#include <cuda_runtime.h> -// #include <hpx/compute/cuda/target.hpp> -// #include <hpx/include/compute.hpp> +/// Turns a std::array_mutex into an scoped lock +template<typename mutex_array_t> +auto make_scoped_lock_from_array(mutex_array_t& mutexes) +{ + return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, + mutexes); +} template <class Interface> class round_robin_pool { private: @@ -38,10 +35,10 @@ template <class Interface> class round_robin_pool { public: template <typename... Ts> - explicit round_robin_pool(size_t number_of_streams, Ts &&... executor_args) { + round_robin_pool(size_t number_of_streams, Ts... 
executor_args) { ref_counters.reserve(number_of_streams); for (int i = 0; i < number_of_streams; i++) { - pool.emplace_back(std::forward<Ts>(executor_args)...); + pool.emplace_back(executor_args...); ref_counters.emplace_back(0); } } @@ -74,11 +71,11 @@ template <class Interface> class priority_pool { std::vector<size_t> priorities{}; // Ref counters public: template <typename... Ts> - explicit priority_pool(size_t number_of_streams, Ts &&... executor_args) { + priority_pool(size_t number_of_streams, Ts... executor_args) { ref_counters.reserve(number_of_streams); priorities.reserve(number_of_streams); for (auto i = 0; i < number_of_streams; i++) { - pool.emplace_back(std::forward<Ts>(executor_args)...); + pool.emplace_back(executor_args...); ref_counters.emplace_back(0); priorities.emplace_back(i); } @@ -120,11 +117,11 @@ template <class Interface, class Pool> class multi_gpu_round_robin_pool { public: template <typename... Ts> multi_gpu_round_robin_pool(size_t number_of_streams, int number_of_gpus, - Ts &&... executor_args) + Ts... executor_args) : streams_per_gpu{number_of_streams} { for (auto gpu_id = 0; gpu_id < number_of_gpus; gpu_id++) { pool.push_back(std::make_tuple( - Pool(number_of_streams, gpu_id, std::forward<Ts>(executor_args)...), + Pool(number_of_streams, gpu_id, executor_args...), 0)); } } @@ -175,7 +172,7 @@ template <class Interface, class Pool> class priority_pool_multi_gpu { public: template <typename... Ts> priority_pool_multi_gpu(size_t number_of_streams, int number_of_gpus, - Ts &&... executor_args) + Ts... executor_args) : streams_per_gpu(number_of_streams) { ref_counters.reserve(number_of_gpus); priorities.reserve(number_of_gpus); @@ -183,7 +180,7 @@ template <class Interface, class Pool> class priority_pool_multi_gpu { priorities.emplace_back(gpu_id); ref_counters.emplace_back(0); gpu_interfaces.emplace_back(streams_per_gpu, gpu_id, - std::forward<Ts>(executor_args)...); + executor_args...); } } // return a tuple with the interface and its index (to release it later) @@ -222,33 +219,41 @@ template <class Interface, class Pool> class priority_pool_multi_gpu { class stream_pool { public: template <class Interface, class Pool, typename... Ts> - static void init(size_t number_of_streams, Ts &&... executor_args) { - stream_pool_implementation<Interface, Pool>::init( - number_of_streams, std::forward<Ts>(executor_args)...); - } + static void init(size_t number_of_streams, Ts ... 
executor_args) { + stream_pool_implementation<Interface, Pool>::init(number_of_streams, + executor_args...); +} template <class Interface, class Pool> static void cleanup() { stream_pool_implementation<Interface, Pool>::cleanup(); } template <class Interface, class Pool> static std::tuple<Interface &, size_t> get_interface() { - return stream_pool_implementation<Interface, Pool>::get_interface(); + return stream_pool_implementation<Interface, Pool>::get_interface(get_device_id()); } template <class Interface, class Pool> static void release_interface(size_t index) noexcept { - stream_pool_implementation<Interface, Pool>::release_interface(index); + stream_pool_implementation<Interface, Pool>::release_interface(index, + get_device_id()); } template <class Interface, class Pool> static bool interface_available(size_t load_limit) noexcept { return stream_pool_implementation<Interface, Pool>::interface_available( - load_limit); + load_limit, get_device_id()); } template <class Interface, class Pool> static size_t get_current_load() noexcept { - return stream_pool_implementation<Interface, Pool>::get_current_load(); + return stream_pool_implementation<Interface, Pool>::get_current_load( + get_device_id()); } + // TODO deprecated! Remove... template <class Interface, class Pool> static size_t get_next_device_id() noexcept { - return stream_pool_implementation<Interface, Pool>::get_next_device_id(); + return stream_pool_implementation<Interface, Pool>::get_next_device_id(get_device_id()); + } + + template <class Interface, class Pool> + static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) { + return stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function); } private: @@ -258,64 +263,66 @@ class stream_pool { template <class Interface, class Pool> class stream_pool_implementation { public: template <typename... Ts> - static void init(size_t number_of_streams, Ts &&... executor_args) { - // TODO(daissgr) What should happen if the instance already exists? - // warning? - if (!pool_instance && number_of_streams > 0) { - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - pool_instance.reset(new stream_pool_implementation()); - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - pool_instance->streampool.reset( - new Pool{number_of_streams, std::forward<Ts>(executor_args)...}); + static void init(size_t number_of_streams, Ts ... executor_args) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_streams > 0) { + for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) { + instance().select_gpu_function(gpu_id); + instance().streampools.emplace_back(number_of_streams, + executor_args...); + } } } + + // TODO add/rename into finalize? 
static void cleanup() { - std::lock_guard<mutex_t> guard(pool_mut); - if (pool_instance) { - pool_instance->streampool.reset(nullptr); - pool_instance.reset(nullptr); - } + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + instance().streampools.clear(); } - static std::tuple<Interface &, size_t> get_interface() { - std::lock_guard<mutex_t> guard(pool_mut); - assert(pool_instance); // should already be initialized - return pool_instance->streampool->get_interface(); + static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) { + std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + return instance().streampools[gpu_id].get_interface(); } - static void release_interface(size_t index) { - std::lock_guard<mutex_t> guard(pool_mut); - assert(pool_instance); // should already be initialized - pool_instance->streampool->release_interface(index); + static void release_interface(size_t index, const size_t gpu_id = 0) { + std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + instance().streampools[gpu_id].release_interface(index); } - static bool interface_available(size_t load_limit) { - std::lock_guard<mutex_t> guard(pool_mut); - if (!pool_instance) { - return false; - } - return pool_instance->streampool->interface_available(load_limit); + static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { + std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + return instance().streampools[gpu_id].interface_available(load_limit); } - static size_t get_current_load() { - std::lock_guard<mutex_t> guard(pool_mut); - if (!pool_instance) { - return 0; - } - assert(pool_instance); // should already be initialized - return pool_instance->streampool->get_current_load(); + static size_t get_current_load(const size_t gpu_id = 0) { + std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + return instance().streampools[gpu_id].get_current_load(); } - static size_t get_next_device_id() { - std::lock_guard<mutex_t> guard(pool_mut); - if (!pool_instance) { - return 0; - } - return pool_instance->streampool->get_next_device_id(); + // TODO deprecated! Remove... 
+ static size_t get_next_device_id(const size_t gpu_id = 0) { + std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + return instance().streampools[gpu_id].get_next_device_id(); + } + + static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + return instance().select_gpu_function = select_gpu_function; } private: - inline static std::unique_ptr<stream_pool_implementation> pool_instance{}; stream_pool_implementation() = default; - inline static mutex_t pool_mut{}; + mutex_t pool_mut{}; + std::function<void(size_t)> select_gpu_function = [](size_t gpu_id) { + // By default no multi gpu support + assert(max_number_gpus == 1); + assert(gpu_id == 0); + }; + + std::deque<Pool> streampools{}; + std::array<mutex_t, max_number_gpus> gpu_mutexes; - std::unique_ptr<Pool> streampool{nullptr}; + static stream_pool_implementation& instance(void) { + static stream_pool_implementation pool_instance{}; + return pool_instance; + } public: ~stream_pool_implementation() = default; @@ -338,13 +345,9 @@ class stream_pool { stream_pool &operator=(stream_pool &&other) = delete; }; -/* template <class Interface, class Pool> */ -/* std::unique_ptr<stream_pool::stream_pool_implementation<Interface, Pool>> */ -/* stream_pool::stream_pool_implementation<Interface, Pool>::pool_instance{}; */ - template <class Interface, class Pool> class stream_interface { public: - explicit stream_interface() + stream_interface() : t(stream_pool::get_interface<Interface, Pool>()), interface(std::get<0>(t)), interface_index(std::get<1>(t)) {} From 1685f84278b80060399f8e74d09cb8c906cd2801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Sun, 11 Jun 2023 23:43:45 +0200 Subject: [PATCH 02/42] Use correct recycler instance per gpu --- include/aggregation_manager.hpp | 3 ++- include/sycl_buffer_util.hpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 47803043..35139495 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -565,7 +565,8 @@ template <typename Executor> class Aggregated_Executor { // get prefered location: aka the current hpx threads location // Usually handy for CPU builds where we want to use the buffers // close to the current CPU core - location_id = hpx::get_worker_thread_num(); + location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; + // division makes sure that we always use the same instance to store our gpu buffers. 
} #endif // Get shiny and new buffer that will be shared between all slices diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 6469aa4e..61d22f8f 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -16,6 +16,8 @@ namespace recycler { namespace detail { +static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); + template <class T> struct sycl_host_default_allocator { using value_type = T; sycl_host_default_allocator() noexcept = default; From d3423b433e5a5013f60ecc4a13f43cb3c62cf637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 00:28:15 +0200 Subject: [PATCH 03/42] Make global function inline --- include/detail/config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/detail/config.hpp b/include/detail/config.hpp index fd15cfc2..4c8eca37 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -53,7 +53,7 @@ static_assert(number_instances > 0); constexpr size_t instances_per_gpu = number_instances / max_number_gpus; /// Uses HPX thread information to determine which GPU should be used -size_t get_device_id(void) { +inline size_t get_device_id(void) { #if defined(CPPUDDLE_HAVE_HPX) return hpx::get_worker_thread_num() / instances_per_gpu; #else From 493710f504e34dec9c16ab81723cbe5c57f177b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 17:49:30 +0200 Subject: [PATCH 04/42] Fix default location id --- include/aggregation_manager.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 35139495..2682a55c 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -559,7 +559,7 @@ template <typename Executor> class Aggregated_Executor { // Default location -- useful for GPU builds as we otherwise create way too // many different buffers for different aggregation sizes on different GPUs - size_t location_id = 0; + size_t location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location From 3c1c3510d13574e24ce52524d41ea4452e3b7a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 19:56:31 +0200 Subject: [PATCH 05/42] Testing extra flag for multi-gpu --- include/cuda_buffer_util.hpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 5d0cb156..bbe01349 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -17,13 +17,14 @@ namespace recycler { namespace detail { -template <class T> struct cuda_pinned_allocator { +template <class T, bool auto_select_device = true> struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; template <class U> explicit cuda_pinned_allocator(cuda_pinned_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { - cudaSetDevice(get_device_id()); + if constexpr (auto_select_device) + cudaSetDevice(get_device_id()); T *data; cudaError_t error = cudaMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T)); @@ -58,13 +59,14 @@ constexpr bool operator!=(cuda_pinned_allocator<T> const &, return false; } 
-template <class T> struct cuda_device_allocator { +template <class T, bool auto_select_device = true> struct cuda_device_allocator { using value_type = T; cuda_device_allocator() noexcept = default; template <class U> explicit cuda_device_allocator(cuda_device_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { - cudaSetDevice(get_device_id()); + if constexpr (auto_select_device) + cudaSetDevice(get_device_id()); T *data; cudaError_t error = cudaMalloc(&data, n * sizeof(T)); if (error != cudaSuccess) { @@ -87,25 +89,25 @@ template <class T> struct cuda_device_allocator { } } }; -template <class T, class U> -constexpr bool operator==(cuda_device_allocator<T> const &, - cuda_device_allocator<U> const &) noexcept { +template <class T, class U, bool auto_select_T, bool auto_select_U> +constexpr bool operator==(cuda_device_allocator<T, auto_select_T> const &, + cuda_device_allocator<U, auto_select_U> const &) noexcept { return true; } -template <class T, class U> -constexpr bool operator!=(cuda_device_allocator<T> const &, - cuda_device_allocator<U> const &) noexcept { +template <class T, class U, bool auto_select_T, bool auto_select_U> +constexpr bool operator!=(cuda_device_allocator<T, auto_select_T> const &, + cuda_device_allocator<U, auto_select_U> const &) noexcept { return false; } } // end namespace detail -template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> +template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0> using recycle_allocator_cuda_host = - detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T>>; -template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> + detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T, auto_select_device>>; +template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0> using recycle_allocator_cuda_device = - detail::recycle_allocator<T, detail::cuda_device_allocator<T>>; + detail::recycle_allocator<T, detail::cuda_device_allocator<T, auto_select_device>>; template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> struct cuda_device_buffer { From 5a44383f57f4bf9c20e983d08e27319e3d81cc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 16:40:05 -0500 Subject: [PATCH 06/42] Remove obsolete multigpu stream pools --- include/stream_manager.hpp | 137 ++++++++----------------------------- tests/stream_test.cpp | 93 +------------------------ 2 files changed, 30 insertions(+), 200 deletions(-) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 87740d2d..ec1eeeda 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -107,114 +107,6 @@ template <class Interface> class priority_pool { } }; -template <class Interface, class Pool> class multi_gpu_round_robin_pool { -private: - using gpu_entry = std::tuple<Pool, size_t>; // interface, ref counter - std::deque<gpu_entry> pool{}; - size_t current_interface{0}; - size_t streams_per_gpu{0}; - -public: - template <typename... Ts> - multi_gpu_round_robin_pool(size_t number_of_streams, int number_of_gpus, - Ts... 
executor_args) - : streams_per_gpu{number_of_streams} { - for (auto gpu_id = 0; gpu_id < number_of_gpus; gpu_id++) { - pool.push_back(std::make_tuple( - Pool(number_of_streams, gpu_id, executor_args...), - 0)); - } - } - - // return a tuple with the interface and its index (to release it later) - std::tuple<Interface &, size_t> get_interface() { - size_t last_interface = current_interface; - current_interface = (current_interface + 1) % pool.size(); - std::get<1>(pool[last_interface])++; - size_t gpu_offset = last_interface * streams_per_gpu; - std::tuple<Interface &, size_t> stream_entry = - std::get<0>(pool[last_interface]).get_interface(); - std::get<1>(stream_entry) += gpu_offset; - return stream_entry; - } - void release_interface(size_t index) { - size_t gpu_index = index / streams_per_gpu; - size_t stream_index = index % streams_per_gpu; - std::get<1>(pool[gpu_index])--; - std::get<0>(pool[gpu_index]).release_interface(stream_index); - } - bool interface_available(size_t load_limit) { - auto ¤t_min_gpu = std::get<0>(*(std::min_element( - std::begin(pool), std::end(pool), - [](const gpu_entry &first, const gpu_entry &second) -> bool { - return std::get<1>(first) < std::get<1>(second); - }))); - return current_min_gpu.interface_available(load_limit); - } - size_t get_current_load() { - auto ¤t_min_gpu = std::get<0>(*(std::min_element( - std::begin(pool), std::end(pool), - [](const gpu_entry &first, const gpu_entry &second) -> bool { - return std::get<1>(first) < std::get<1>(second); - }))); - return current_min_gpu.get_current_load(); - } - size_t get_next_device_id() { return current_interface; } -}; - -template <class Interface, class Pool> class priority_pool_multi_gpu { -private: - std::vector<size_t> priorities{}; - std::vector<size_t> ref_counters{}; - std::deque<Pool> gpu_interfaces{}; - size_t streams_per_gpu{0}; - -public: - template <typename... Ts> - priority_pool_multi_gpu(size_t number_of_streams, int number_of_gpus, - Ts... 
executor_args) - : streams_per_gpu(number_of_streams) { - ref_counters.reserve(number_of_gpus); - priorities.reserve(number_of_gpus); - for (auto gpu_id = 0; gpu_id < number_of_gpus; gpu_id++) { - priorities.emplace_back(gpu_id); - ref_counters.emplace_back(0); - gpu_interfaces.emplace_back(streams_per_gpu, gpu_id, - executor_args...); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple<Interface &, size_t> get_interface() { - auto gpu = priorities[0]; - ref_counters[gpu]++; - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - size_t gpu_offset = gpu * streams_per_gpu; - auto stream_entry = gpu_interfaces[gpu].get_interface(); - std::get<1>(stream_entry) += gpu_offset; - return stream_entry; - } - void release_interface(size_t index) { - size_t gpu_index = index / streams_per_gpu; - size_t stream_index = index % streams_per_gpu; - ref_counters[gpu_index]--; - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - gpu_interfaces[gpu_index].release_interface(stream_index); - } - bool interface_available(size_t load_limit) { - return gpu_interfaces[priorities[0]].interface_available(load_limit); - } - size_t get_current_load() { - return gpu_interfaces[priorities[0]].get_current_load(); - } - size_t get_next_device_id() { return priorities[0]; } -}; - /// Access/Concurrency Control for stream pool implementation class stream_pool { public: @@ -262,8 +154,18 @@ class stream_pool { private: template <class Interface, class Pool> class stream_pool_implementation { public: + /// Deprecated! Use init_on_all_gpu or init_on_gpu template <typename... Ts> static void init(size_t number_of_streams, Ts ... executor_args) { + static_assert(max_number_gpus == 1, "deprecated stream_pool::init does not support multigpu"); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + instance().streampools.emplace_back(number_of_streams, + executor_args...); + } + + /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments + template <typename... Ts> + static void init_on_all_gpus(size_t number_of_streams, Ts ... executor_args) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) { @@ -274,36 +176,55 @@ class stream_pool { } } + /// Per-GPU init allowing for different init parameters depending on the GPU + /// (useful for executor that expect an GPU-id during construction) + template <typename... Ts> + static void init_on_gpu(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_streams > 0) { + instance().select_gpu_function(gpu_id); + instance().streampools.emplace_back(number_of_streams, + executor_args...); + } + } + // TODO add/rename into finalize? 
static void cleanup() { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + assert(instance().streampools.size() == max_number_gpus); instance().streampools.clear(); } static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) { std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == max_number_gpus); return instance().streampools[gpu_id].get_interface(); } static void release_interface(size_t index, const size_t gpu_id = 0) { std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == max_number_gpus); instance().streampools[gpu_id].release_interface(index); } static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == max_number_gpus); return instance().streampools[gpu_id].interface_available(load_limit); } static size_t get_current_load(const size_t gpu_id = 0) { std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == max_number_gpus); return instance().streampools[gpu_id].get_current_load(); } // TODO deprecated! Remove... static size_t get_next_device_id(const size_t gpu_id = 0) { std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == max_number_gpus); return instance().streampools[gpu_id].get_next_device_id(); } static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + assert(instance().streampools.size() == max_number_gpus); return instance().select_gpu_function = select_gpu_function; } diff --git a/tests/stream_test.cpp b/tests/stream_test.cpp index e7010d0f..96599759 100644 --- a/tests/stream_test.cpp +++ b/tests/stream_test.cpp @@ -31,30 +31,9 @@ int main(int argc, char *argv[]) { test_pool_ref_counting< hpx::cuda::experimental::cuda_executor, round_robin_pool<hpx::cuda::experimental::cuda_executor>>(2, 0, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - false); std::cout << "Finished ref counting tests!" << std::endl; + std::cout << "Starting wrapper objects tests ..." 
<< std::endl; test_pool_wrappers<hpx::cuda::experimental::cuda_executor, priority_pool<hpx::cuda::experimental::cuda_executor>>( @@ -62,62 +41,16 @@ int main(int argc, char *argv[]) { test_pool_wrappers<hpx::cuda::experimental::cuda_executor, round_robin_pool<hpx::cuda::experimental::cuda_executor>>( 2, 0, false); - test_pool_wrappers< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - false); - test_pool_wrappers< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false); - - test_pool_wrappers< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false); - test_pool_wrappers< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - false); std::cout << "Finished wrapper objects tests!" << std::endl; std::cout << "Starting memcpy tests... " << std::endl; test_pool_memcpy<hpx::cuda::experimental::cuda_executor, round_robin_pool<hpx::cuda::experimental::cuda_executor>>( 2, 0, false); - test_pool_memcpy< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - false); test_pool_memcpy<hpx::cuda::experimental::cuda_executor, priority_pool<hpx::cuda::experimental::cuda_executor>>( 2, 0, false); - test_pool_memcpy<hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>( - 2, 1, false); - // combo pool - test_pool_memcpy<hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>( - 2, 1, false); - test_pool_memcpy< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - false); std::cout << "Finished memcpy tests! " << std::endl; std::cout << "Starting memcpy polling tests... 
" << std::endl; @@ -127,33 +60,9 @@ int main(int argc, char *argv[]) { test_pool_memcpy<hpx::cuda::experimental::cuda_executor, round_robin_pool<hpx::cuda::experimental::cuda_executor>>( 2, 0, true); - test_pool_memcpy< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - true); test_pool_memcpy<hpx::cuda::experimental::cuda_executor, priority_pool<hpx::cuda::experimental::cuda_executor>>( 2, 0, true); - test_pool_memcpy< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, true); - - // combo pool - test_pool_memcpy< - hpx::cuda::experimental::cuda_executor, - multi_gpu_round_robin_pool< - hpx::cuda::experimental::cuda_executor, - priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, true); - test_pool_memcpy< - hpx::cuda::experimental::cuda_executor, - priority_pool_multi_gpu< - hpx::cuda::experimental::cuda_executor, - round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, - true); hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); } recycler::force_cleanup(); From 093dce2c2d18c752868c4d39488167bdff2f0dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 16:40:22 -0500 Subject: [PATCH 07/42] Add select device functor --- include/aligned_buffer_util.hpp | 8 ++++++++ include/buffer_manager.hpp | 18 +++++++++++++++++- include/cuda_buffer_util.hpp | 15 +++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 456420bc..d36a994a 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -10,6 +10,14 @@ #include <boost/align/aligned_allocator.hpp> namespace recycler { +namespace device_selection { +template <typename T, size_t alignement> +struct select_device_functor< + T, boost::alignment::aligned_allocator<T, alignement>> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + template <typename T, std::size_t alignement, std::enable_if_t<std::is_trivial<T>::value, int> = 0> using recycle_aligned = detail::recycle_allocator< diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 8907a0e5..31a837ed 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -42,10 +42,25 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #include "../include/detail/config.hpp" namespace recycler { + +namespace device_selection { +template <typename T, typename Allocator> struct select_device_functor { + void operator()(const size_t device_id) { + if constexpr (max_number_gpus > 1) + throw std::runtime_error( + "Allocators used in Multi-GPU builds need explicit Multi-GPU support " + "(by having a select_device_functor overload"); + } +}; +template <typename T> struct select_device_functor<T, std::allocator<T>> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + namespace detail { + class buffer_recycler { - // Public interface public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) @@ -239,6 +254,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // No unused buffer found -> Create new one and return it try { + 
recycler::device_selection::select_device_functor<T, Host_Allocator>{}(location_id / number_instances); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index bbe01349..587eea97 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -17,6 +17,8 @@ namespace recycler { namespace detail { + + template <class T, bool auto_select_device = true> struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; @@ -48,6 +50,7 @@ template <class T, bool auto_select_device = true> struct cuda_pinned_allocator } } }; + template <class T, class U> constexpr bool operator==(cuda_pinned_allocator<T> const &, cuda_pinned_allocator<U> const &) noexcept { @@ -100,6 +103,7 @@ constexpr bool operator!=(cuda_device_allocator<T, auto_select_T> const &, return false; } + } // end namespace detail template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0> @@ -172,5 +176,16 @@ struct cuda_aggregated_device_buffer { Host_Allocator &alloc; }; +namespace device_selection { +template <typename T> +struct select_device_functor<T, detail::cuda_pinned_allocator<T>> { + void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); } +}; +template <typename T> +struct select_device_functor<T, detail::cuda_device_allocator<T>> { + void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); } +}; +} // namespace device_selection + } // end namespace recycler #endif From cd93da1beb9116288a0d0c0c6929c74ce99003ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 16:43:19 -0500 Subject: [PATCH 08/42] Remove previous attempt at device selector --- include/cuda_buffer_util.hpp | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 587eea97..431876f2 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -19,14 +19,12 @@ namespace detail { -template <class T, bool auto_select_device = true> struct cuda_pinned_allocator { +template <class T> struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; template <class U> explicit cuda_pinned_allocator(cuda_pinned_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { - if constexpr (auto_select_device) - cudaSetDevice(get_device_id()); T *data; cudaError_t error = cudaMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T)); @@ -62,14 +60,12 @@ constexpr bool operator!=(cuda_pinned_allocator<T> const &, return false; } -template <class T, bool auto_select_device = true> struct cuda_device_allocator { +template <class T> struct cuda_device_allocator { using value_type = T; cuda_device_allocator() noexcept = default; template <class U> explicit cuda_device_allocator(cuda_device_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { - if constexpr (auto_select_device) - cudaSetDevice(get_device_id()); T *data; cudaError_t error = cudaMalloc(&data, n * sizeof(T)); if (error != cudaSuccess) { @@ -92,26 +88,26 @@ template <class T, bool auto_select_device = true> struct cuda_device_allocator } } }; -template <class T, class U, bool auto_select_T, bool auto_select_U> -constexpr bool operator==(cuda_device_allocator<T, auto_select_T> const &, - cuda_device_allocator<U, 
auto_select_U> const &) noexcept { +template <class T, class U> +constexpr bool operator==(cuda_device_allocator<T> const &, + cuda_device_allocator<U> const &) noexcept { return true; } -template <class T, class U, bool auto_select_T, bool auto_select_U> -constexpr bool operator!=(cuda_device_allocator<T, auto_select_T> const &, - cuda_device_allocator<U, auto_select_U> const &) noexcept { +template <class T, class U> +constexpr bool operator!=(cuda_device_allocator<T> const &, + cuda_device_allocator<U> const &) noexcept { return false; } } // end namespace detail -template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0> +template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> using recycle_allocator_cuda_host = - detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T, auto_select_device>>; -template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0> + detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T>>; +template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> using recycle_allocator_cuda_device = - detail::recycle_allocator<T, detail::cuda_device_allocator<T, auto_select_device>>; + detail::recycle_allocator<T, detail::cuda_device_allocator<T>>; template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> struct cuda_device_buffer { From 13847d983458a5db271dc741cecc21bc420ef3ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 12 Jun 2023 17:51:57 -0500 Subject: [PATCH 09/42] Sync --- include/buffer_manager.hpp | 26 ++++++++++++++++++++++++++ include/kokkos_buffer_util.hpp | 29 +++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 31a837ed..ad42698a 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -515,6 +515,12 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { T *data = buffer_recycler::get<T, Host_Allocator>(n); return data; } + T *allocate(std::size_t n, std::size_t location_id) { + assert(location_id == 0); + T *data = buffer_recycler::get<T, Host_Allocator>( + n, true, location_id); // also initializes the buffer if it isn't reused + return data; + } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n); } @@ -531,6 +537,13 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { n, false, hpx::get_worker_thread_num()); return data; } + T *allocate(std::size_t n, std::size_t location_id) { + assert(location_id >= 0 && location_id < number_instances); + T *data = buffer_recycler::get<T, Host_Allocator>( + n, true, location_id); // also initializes the buffer + // if it isn't reused + return data; + } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint); } @@ -580,6 +593,12 @@ struct aggressive_recycle_allocator { n, true); // also initializes the buffer if it isn't reused return data; } + T *allocate(std::size_t n, std::size_t location_id) { + assert(location_id == 0); + T *data = buffer_recycler::get<T, Host_Allocator>( + n, true, location_id); // also initializes the buffer if it isn't reused + return data; + } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n); } @@ -597,6 +616,13 @@ struct aggressive_recycle_allocator { // if it isn't 
reused return data; } + T *allocate(std::size_t n, std::size_t location_id) { + assert(location_id >= 0 && location_id < number_instances); + T *data = buffer_recycler::get<T, Host_Allocator>( + n, true, location_id); // also initializes the buffer + // if it isn't reused + return data; + } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint); } diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index f78f6bbb..3d8646a8 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -80,7 +80,7 @@ class aggregated_recycled_view : public kokkos_type { template <typename kokkos_type, typename alloc_type, typename element_type> class recycled_view : public kokkos_type { private: - static alloc_type allocator; + alloc_type allocator; size_t total_elements{0}; std::shared_ptr<element_type> data_ref_counter; @@ -97,6 +97,28 @@ class recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( allocator, total_elements)) {} + template <class... Args> + explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args) + : allocator(alloc), kokkos_type( + allocator.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type), location_id), + args...), + total_elements(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( + allocator, total_elements)) {} + + template <bool use_custom_location_id, class... Args> + explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args) + : allocator(alloc), kokkos_type( + allocator.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type), location_id), + args...), + total_elements(kokkos_type::required_allocation_size(args...) 
/ + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( + allocator, total_elements)) {} + recycled_view( const recycled_view<kokkos_type, alloc_type, element_type> &other) : kokkos_type(other) { @@ -110,7 +132,6 @@ class recycled_view : public kokkos_type { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; - allocator.increase_usage_counter(other.data(), other.total_elements); return *this; } @@ -132,8 +153,8 @@ class recycled_view : public kokkos_type { ~recycled_view() { } }; -template <class kokkos_type, class alloc_type, class element_type> -alloc_type recycled_view<kokkos_type, alloc_type, element_type>::allocator; +/* template <class kokkos_type, class alloc_type, class element_type> */ +/* alloc_type recycled_view<kokkos_type, alloc_type, element_type>::allocator; */ } // end namespace recycler From 7b9e4a30cb949ec2cf9d27f5c921bd5cc0d0fe8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 13 Jun 2023 16:16:32 +0200 Subject: [PATCH 10/42] Add location specific view constructors --- include/buffer_manager.hpp | 30 +++-------------------- include/kokkos_buffer_util.hpp | 43 +++++++++++++++++++++------------ tests/allocator_kokkos_test.cpp | 1 + 3 files changed, 32 insertions(+), 42 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ad42698a..5325bffb 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -501,6 +501,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL template <typename T, typename Host_Allocator> struct recycle_allocator { using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>); const std::optional<size_t> dealloc_hint; #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS @@ -515,12 +517,6 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { T *data = buffer_recycler::get<T, Host_Allocator>(n); return data; } - T *allocate(std::size_t n, std::size_t location_id) { - assert(location_id == 0); - T *data = buffer_recycler::get<T, Host_Allocator>( - n, true, location_id); // also initializes the buffer if it isn't reused - return data; - } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n); } @@ -537,13 +533,6 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { n, false, hpx::get_worker_thread_num()); return data; } - T *allocate(std::size_t n, std::size_t location_id) { - assert(location_id >= 0 && location_id < number_instances); - T *data = buffer_recycler::get<T, Host_Allocator>( - n, true, location_id); // also initializes the buffer - // if it isn't reused - return data; - } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint); } @@ -578,6 +567,8 @@ operator!=(recycle_allocator<T, Host_Allocator> const &, template <typename T, typename Host_Allocator> struct aggressive_recycle_allocator { using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>); std::optional<size_t> dealloc_hint; #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS @@ -593,12 +584,6 @@ struct aggressive_recycle_allocator { n, true); // also initializes the buffer if it isn't reused return data; } - T 
*allocate(std::size_t n, std::size_t location_id) { - assert(location_id == 0); - T *data = buffer_recycler::get<T, Host_Allocator>( - n, true, location_id); // also initializes the buffer if it isn't reused - return data; - } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n); } @@ -616,13 +601,6 @@ struct aggressive_recycle_allocator { // if it isn't reused return data; } - T *allocate(std::size_t n, std::size_t location_id) { - assert(location_id >= 0 && location_id < number_instances); - T *data = buffer_recycler::get<T, Host_Allocator>( - n, true, location_id); // also initializes the buffer - // if it isn't reused - return data; - } void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint); } diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 3d8646a8..52413cce 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -7,6 +7,8 @@ #define KOKKOS_BUFFER_UTIL_HPP #include <Kokkos_Core.hpp> #include <memory> +#include <buffer_manager.hpp> +#include <type_traits> namespace recycler { @@ -86,8 +88,10 @@ class recycled_view : public kokkos_type { public: using view_type = kokkos_type; - template <class... Args> - explicit recycled_view(Args... args) + static_assert(std::is_same_v<element_type, typename alloc_type::value_type>); + template <typename... Args, + std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true> + recycled_view(Args... args) : kokkos_type( allocator.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -97,24 +101,33 @@ class recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( allocator, total_elements)) {} - template <class... Args> - explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args) - : allocator(alloc), kokkos_type( - allocator.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type), location_id), + template <typename... Args, + std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true> + recycled_view(std::size_t location_id, Args... args) + : kokkos_type( + detail::buffer_recycler::get< + element_type, typename alloc_type::underlying_allocator_type>( + kokkos_type::required_allocation_size(args...) / + sizeof(element_type), + false, location_id), args...), total_elements(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( allocator, total_elements)) {} - template <bool use_custom_location_id, class... Args> - explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args) - : allocator(alloc), kokkos_type( - allocator.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type), location_id), - args...), - total_elements(kokkos_type::required_allocation_size(args...) 
/ + template < + typename layout_t, + std::enable_if_t<Kokkos::is_array_layout<layout_t>::value, bool> = true> + recycled_view(std::size_t location_id, layout_t layout) + : kokkos_type( + detail::buffer_recycler::get< + element_type, typename alloc_type::underlying_allocator_type>( + kokkos_type::required_allocation_size(layout) / + sizeof(element_type), + false, location_id), + layout), + total_elements(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( allocator, total_elements)) {} @@ -153,8 +166,6 @@ class recycled_view : public kokkos_type { ~recycled_view() { } }; -/* template <class kokkos_type, class alloc_type, class element_type> */ -/* alloc_type recycled_view<kokkos_type, alloc_type, element_type>::allocator; */ } // end namespace recycler diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index de808859..e0c7fb55 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -81,6 +81,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { test_view my_wrapper_test1(1000); test_view my_wrapper_test2(1000); + test_view my_wrapper_test3(127, 1000); // test 1D with location id double t = 2.6; Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000), KOKKOS_LAMBDA(const int n) { From d8a887108e403a47ad8bf2e9f51599674125d390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 13 Jun 2023 12:20:59 -0500 Subject: [PATCH 11/42] Fix cmake and interface change --- CMakeLists.txt | 14 ++++++++++---- include/stream_manager.hpp | 4 ++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5801e499..5b6bd617 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,6 +170,9 @@ if (CPPUDDLE_WITH_HPX) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") endif() +else() + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") endif() if (CPPUDDLE_WITH_COUNTERS) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") @@ -184,12 +187,15 @@ if (CPPUDDLE_WITH_HPX) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX") if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}") else() - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") endif() +else() + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") + 
target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") endif() if (CPPUDDLE_WITH_COUNTERS) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index ec1eeeda..22997e62 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -165,7 +165,7 @@ class stream_pool { /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments template <typename... Ts> - static void init_on_all_gpus(size_t number_of_streams, Ts ... executor_args) { + static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) { @@ -179,7 +179,7 @@ class stream_pool { /// Per-GPU init allowing for different init parameters depending on the GPU /// (useful for executor that expect an GPU-id during construction) template <typename... Ts> - static void init_on_gpu(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { + static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { instance().select_gpu_function(gpu_id); From 7f7e9eac65d077e0ad4308b0f97c667c1a92deb6 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Mon, 19 Jun 2023 13:40:47 -0500 Subject: [PATCH 12/42] Fix multigpu build --- include/aggregation_manager.hpp | 32 +++++++++++++++------------- include/buffer_manager.hpp | 5 ++++- include/cuda_buffer_util.hpp | 4 ++-- include/stream_manager.hpp | 37 ++++++++++++++++++++------------- 4 files changed, 47 insertions(+), 31 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 2682a55c..4bc4dbb7 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -389,6 +389,7 @@ template <typename Executor> class Aggregated_Executor { Executor &executor; public: + size_t gpu_id; // Subclasses /// Slice class - meant as a scope interface to the aggregated executor @@ -559,13 +560,14 @@ template <typename Executor> class Aggregated_Executor { // Default location -- useful for GPU builds as we otherwise create way too // many different buffers for different aggregation sizes on different GPUs - size_t location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; + size_t location_id = gpu_id * instances_per_gpu; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location // Usually handy for CPU builds where we want to use the buffers // close to the current CPU core - location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; + /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ + location_id = (gpu_id) * instances_per_gpu; // division makes sure that we always use the same instance to store our gpu buffers. 
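To make the index arithmetic above concrete, a small self-contained sketch of the assumed mapping between GPU ids and buffer-manager buckets (the numbers are placeholders, and instances_per_gpu = number_instances / max_number_gpus is an assumption about how the two configuration limits relate):

#include <cstddef>

constexpr std::size_t max_number_gpus = 2;    // placeholder for CPPUDDLE_MAX_NUMBER_GPUS
constexpr std::size_t number_instances = 128; // placeholder for CPPUDDLE_MAX_NUMBER_WORKERS
constexpr std::size_t instances_per_gpu = number_instances / max_number_gpus;

// What the aggregated executor does above: pick the first bucket owned by its GPU.
constexpr std::size_t first_bucket_of(std::size_t gpu_id) {
  return gpu_id * instances_per_gpu;
}
// What the buffer manager does when allocating: recover the GPU from the bucket.
constexpr std::size_t gpu_of(std::size_t location_id) {
  return location_id / instances_per_gpu;
}
static_assert(gpu_of(first_bucket_of(1)) == 1, "the two mappings round-trip");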
} #endif @@ -914,11 +916,11 @@ template <typename Executor> class Aggregated_Executor { } Aggregated_Executor(const size_t number_slices, - Aggregated_Executor_Modes mode) + Aggregated_Executor_Modes mode, const size_t gpu_id) : max_slices(number_slices), current_slices(0), slices_exhausted(false),dealloc_counter(0), - mode(mode), executor_slices_alive(false), buffers_in_use(false), + mode(mode), executor_slices_alive(false), buffers_in_use(false), gpu_id(gpu_id), executor_tuple( - stream_pool::get_interface<Executor, round_robin_pool<Executor>>()), + stream_pool::get_interface<Executor, round_robin_pool<Executor>>(gpu_id)), executor(std::get<0>(executor_tuple)), current_continuation(hpx::make_ready_future()), last_stream_launch_done(hpx::make_ready_future()) {} @@ -1009,20 +1011,22 @@ class aggregation_pool { template <typename... Ts> static void init(size_t number_of_executors, size_t slices_per_executor, Aggregated_Executor_Modes mode) { - const size_t gpu_id = get_device_id(); - std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode); + for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) { + std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; } /// Will always return a valid executor slice static decltype(auto) request_executor_slice(void) { const size_t gpu_id = get_device_id(); + /* const size_t gpu_id = 1; */ std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); assert(!instance()[gpu_id].aggregation_executor_pool.empty()); std::optional<hpx::lcos::future< @@ -1052,7 +1056,7 @@ class aggregation_pool { // return empty optional if (instance()[gpu_id].growing_pool) { instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode); + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); instance()[gpu_id].current_interface = instance()[gpu_id].aggregation_executor_pool.size() - 1; assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 5325bffb..61254efe 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -254,7 +254,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // No unused buffer found -> Create new one and return it try { - recycler::device_selection::select_device_functor<T, Host_Allocator>{}(location_id / number_instances); + recycler::device_selection::select_device_functor<T, Host_Allocator>{}( + location_id / instances_per_gpu); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( @@ -278,6 +279,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // If there still isn't enough memory left, the caller has to handle it // We've done all we can in here Host_Allocator alloc; + 
recycler::device_selection::select_device_functor<T, Host_Allocator>{}( + location_id / instances_per_gpu); T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 431876f2..8fb57d4f 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -175,11 +175,11 @@ struct cuda_aggregated_device_buffer { namespace device_selection { template <typename T> struct select_device_functor<T, detail::cuda_pinned_allocator<T>> { - void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); } + void operator()(const size_t device_id) { cudaSetDevice(device_id); } }; template <typename T> struct select_device_functor<T, detail::cuda_device_allocator<T>> { - void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); } + void operator()(const size_t device_id) { cudaSetDevice(device_id); } }; } // namespace device_selection diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 22997e62..38f2f31d 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -44,6 +44,7 @@ template <class Interface> class round_robin_pool { } // return a tuple with the interface and its index (to release it later) std::tuple<Interface &, size_t> get_interface() { + assert(!(pool.empty())); size_t last_interface = current_interface; current_interface = (current_interface + 1) % pool.size(); ref_counters[last_interface]++; @@ -114,18 +115,28 @@ class stream_pool { static void init(size_t number_of_streams, Ts ... executor_args) { stream_pool_implementation<Interface, Pool>::init(number_of_streams, executor_args...); -} + } + template <class Interface, class Pool, typename... Ts> + static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { + stream_pool_implementation<Interface, Pool>::init_all_executor_pools(number_of_streams, + executor_args...); + } + template <class Interface, class Pool, typename... Ts> + static void init_executor_pool(size_t pool_id, size_t number_of_streams, Ts ... 
executor_args) { + stream_pool_implementation<Interface, Pool>::init_executor_pool(pool_id, number_of_streams, + executor_args...); + } template <class Interface, class Pool> static void cleanup() { stream_pool_implementation<Interface, Pool>::cleanup(); } template <class Interface, class Pool> - static std::tuple<Interface &, size_t> get_interface() { - return stream_pool_implementation<Interface, Pool>::get_interface(get_device_id()); + static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = get_device_id()) { + return stream_pool_implementation<Interface, Pool>::get_interface(gpu_id); } template <class Interface, class Pool> - static void release_interface(size_t index) noexcept { + static void release_interface(size_t index, const size_t gpu_id = get_device_id()) noexcept { stream_pool_implementation<Interface, Pool>::release_interface(index, - get_device_id()); + gpu_id); } template <class Interface, class Pool> static bool interface_available(size_t load_limit) noexcept { @@ -144,8 +155,8 @@ class stream_pool { } template <class Interface, class Pool> - static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) { - return stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function); + static void set_device_selector(std::function<void(size_t)> select_gpu_function) { + stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function); } private: @@ -157,10 +168,10 @@ class stream_pool { /// Deprecated! Use init_on_all_gpu or init_on_gpu template <typename... Ts> static void init(size_t number_of_streams, Ts ... executor_args) { - static_assert(max_number_gpus == 1, "deprecated stream_pool::init does not support multigpu"); + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && max_number_gpus == 1, */ + /* "deprecated stream_pool::init does not support multigpu"); */ auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - instance().streampools.emplace_back(number_of_streams, - executor_args...); + instance().streampools.emplace_back(number_of_streams, executor_args...); } /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments @@ -222,10 +233,10 @@ class stream_pool { return instance().streampools[gpu_id].get_next_device_id(); } - static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) { + static void set_device_selector(std::function<void(size_t)> select_gpu_function) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); assert(instance().streampools.size() == max_number_gpus); - return instance().select_gpu_function = select_gpu_function; + instance().select_gpu_function = select_gpu_function; } private: @@ -290,8 +301,6 @@ template <class Interface, class Pool> class stream_interface { return interface.async_execute(std::forward<F>(f), std::forward<Ts>(ts)...); } - inline size_t get_gpu_id() noexcept { return interface.get_gpu_id(); } - // allow implict conversion operator Interface &() { // NOLINT return interface; From c39ee74ea6db2f220ecffc26b4aa9045cae2fee3 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Tue, 20 Jun 2023 11:29:02 -0500 Subject: [PATCH 13/42] Use spinlocks without backoff --- include/detail/config.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 4c8eca37..f3717411 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -10,8 +10,8 @@ // #if 
defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) #include <hpx/mutex.hpp> -using mutex_t = hpx::spinlock; -using aggregation_mutex_t = hpx::spinlock; +using mutex_t = hpx::spinlock_no_backoff; +using aggregation_mutex_t = hpx::spinlock_no_backoff; #else #include <mutex> using mutex_t = std::mutex; From 93f2d6060f6e00003de3b053c6004fef841b6e16 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Thu, 22 Jun 2023 22:04:25 -0500 Subject: [PATCH 14/42] Add select gpu function --- include/aggregation_manager.hpp | 1 + include/stream_manager.hpp | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 4bc4dbb7..0269b527 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -816,6 +816,7 @@ template <typename Executor> class Aggregated_Executor { // Fallback launch condidtion: Launch as soon as the underlying stream // is ready /* auto slices_full_fut = slices_full_promise.get_future(); */ + stream_pool::select_device<Executor, round_robin_pool<Executor>>(gpu_id); auto exec_fut = executor.get_future(); /* fut = hpx::when_any(exec_fut, slices_full_fut); */ fut = std::move(exec_fut); diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 38f2f31d..a10229a2 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -159,6 +159,11 @@ class stream_pool { stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function); } + template <class Interface, class Pool> + static void select_device(size_t gpu_id) { + stream_pool_implementation<Interface, Pool>::select_device(gpu_id); + } + private: stream_pool() = default; @@ -239,6 +244,10 @@ class stream_pool { instance().select_gpu_function = select_gpu_function; } + static void select_device(size_t gpu_id) { + instance().select_gpu_function(gpu_id); + } + private: stream_pool_implementation() = default; mutex_t pool_mut{}; From 1dd0d5260799c805006949ce87f3dd2045d6db18 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Tue, 27 Jun 2023 10:48:11 -0500 Subject: [PATCH 15/42] Use double checked locking for aggregation again --- include/aggregation_manager.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 0269b527..57d39f15 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -675,11 +675,11 @@ template <typename Executor> class Aggregated_Executor { /// Only meant to be accessed by the slice executors bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, executor); overall_launch_counter = function_calls.size(); @@ -695,11 +695,11 @@ template <typename Executor> class Aggregated_Executor { /// Only meant to be accessed by the slice executors template <typename F, typename... 
Ts> void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, executor); overall_launch_counter = function_calls.size(); @@ -718,11 +718,11 @@ template <typename Executor> class Aggregated_Executor { template <typename F, typename... Ts> hpx::lcos::future<void> async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, executor); overall_launch_counter = function_calls.size(); @@ -738,11 +738,11 @@ template <typename Executor> class Aggregated_Executor { template <typename F, typename... Ts> hpx::lcos::shared_future<void> wrap_async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, executor); overall_launch_counter = function_calls.size(); From cdb6f5129d8c555a8ed5f8cf25dab24a8e55f942 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Tue, 27 Jun 2023 10:48:27 -0500 Subject: [PATCH 16/42] Increase deadlock test iterations again --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b6bd617..fb38b230 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,7 @@ set(CPPUDDLE_WITH_MAX_NUMBER_WORKERS "128" CACHE STRING "Max number of workers t # Test-related options option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. 
Useful for extended testing" OFF) option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) -set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "10000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") +set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF) # Tooling options From 33be329cfee44be18b45b0b268ef60104cb0233d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 27 Jun 2023 14:34:37 -0500 Subject: [PATCH 17/42] Revert "Use double checked locking for aggregation again" This reverts commit 1dd0d5260799c805006949ce87f3dd2045d6db18. Caused trouble with the deque access outside of the mutex-protected area when adding anouther call to the deque I think. --- include/aggregation_manager.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 57d39f15..0269b527 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -675,11 +675,11 @@ template <typename Executor> class Aggregated_Executor { /// Only meant to be accessed by the slice executors bool sync_aggregation_slices(const size_t slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, executor); overall_launch_counter = function_calls.size(); @@ -695,11 +695,11 @@ template <typename Executor> class Aggregated_Executor { /// Only meant to be accessed by the slice executors template <typename F, typename... Ts> void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, executor); overall_launch_counter = function_calls.size(); @@ -718,11 +718,11 @@ template <typename Executor> class Aggregated_Executor { template <typename F, typename... 
Ts> hpx::lcos::future<void> async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, executor); overall_launch_counter = function_calls.size(); @@ -738,11 +738,11 @@ template <typename Executor> class Aggregated_Executor { template <typename F, typename... Ts> hpx::lcos::shared_future<void> wrap_async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + std::lock_guard<aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + /* std::lock_guard<aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, executor); overall_launch_counter = function_calls.size(); From eb242180c534ffe23b1667c96fb93a37d6d62714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 27 Jun 2023 15:14:42 -0500 Subject: [PATCH 18/42] Add default id --- include/aggregation_manager.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 0269b527..e687e106 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -917,7 +917,7 @@ template <typename Executor> class Aggregated_Executor { } Aggregated_Executor(const size_t number_slices, - Aggregated_Executor_Modes mode, const size_t gpu_id) + Aggregated_Executor_Modes mode, const size_t gpu_id = 0) : max_slices(number_slices), current_slices(0), slices_exhausted(false),dealloc_counter(0), mode(mode), executor_slices_alive(false), buffers_in_use(false), gpu_id(gpu_id), executor_tuple( From bc1edeee940ca40cada14f5143156536581ff6e1 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 28 Jun 2023 13:01:24 -0500 Subject: [PATCH 19/42] Fix mutex type for aggregation --- include/detail/config.hpp | 3 ++- tests/work_aggregation_cuda_triad.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/detail/config.hpp b/include/detail/config.hpp index f3717411..0244ae9b 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -11,7 +11,7 @@ #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) #include <hpx/mutex.hpp> using mutex_t = hpx::spinlock_no_backoff; -using aggregation_mutex_t = hpx::spinlock_no_backoff; +using aggregation_mutex_t = hpx::mutex; #else #include <mutex> using mutex_t = std::mutex; @@ -43,6 +43,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR // Thread and MultiGPU configuration // constexpr size_t number_instances = CPPUDDLE_MAX_NUMBER_WORKERS; +static_assert(number_instances >= 1); constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS; #ifndef CPPUDDLE_HAVE_HPX static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu"); diff --git 
a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index 4bc050b1..2ded567a 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -28,6 +28,7 @@ __global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_ //=============================================================================== //=============================================================================== int hpx_main(int argc, char *argv[]) { + static_assert(max_number_gpus == 1, "This test currently does not support MultiGPU builds!"); // Init parameters size_t problem_size{0}; size_t kernel_size{0}; From 7dcab62782e037121e86fadb8e1d3d2694e1e3e4 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 28 Jun 2023 15:33:55 -0500 Subject: [PATCH 20/42] Fix kokkos test --- tests/allocator_kokkos_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index e0c7fb55..6109f71d 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { test_view my_wrapper_test1(1000); test_view my_wrapper_test2(1000); - test_view my_wrapper_test3(127, 1000); // test 1D with location id + test_view my_wrapper_test3(number_instances - 1, 1000); // test 1D with location id double t = 2.6; Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000), KOKKOS_LAMBDA(const int n) { From db58db97cd5e74f5669e368a87bf5af574c9b0b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Mon, 3 Jul 2023 15:16:40 -0500 Subject: [PATCH 21/42] Check for invalid location ids --- include/buffer_manager.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 61254efe..1e434296 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -221,6 +221,9 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL if (location_hint) { location_id = location_hint.value(); } + if (location_id >= number_instances) { + throw std::runtime_error("Tried to create buffer with invalid location_id [get]"); + } std::lock_guard<mutex_t> guard(instance()[location_id].mut); @@ -305,6 +308,10 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL if (location_hint) { size_t location_id = location_hint.value(); + if (location_id >= number_instances) { + throw std::runtime_error( + "Buffer recycler received invalid location hint [mark_unused]"); + } std::lock_guard<mutex_t> guard(instance()[location_id].mut); if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { From 073f4dad2edb1138b6c0ef8e6583d17a0b48ca22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Sat, 12 Aug 2023 15:55:22 -0500 Subject: [PATCH 22/42] Add hpx performance counters --- include/buffer_manager.hpp | 183 +++++++++++++++--- tests/allocator_aligned_test.cpp | 3 + tests/allocator_hpx_test.cpp | 2 + ...llocator_kokkos_executor_for_loop_test.cpp | 1 + tests/allocator_kokkos_test.cpp | 1 + tests/allocator_test.cpp | 3 + tests/work_aggregation_test.cpp | 1 + 7 files changed, 167 insertions(+), 27 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 1e434296..f17cc725 100644 ---
a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -37,6 +37,9 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #ifdef CPPUDDLE_HAVE_COUNTERS #include <boost/core/demangle.hpp> +#if defined(CPPUDDLE_HAVE_HPX) +#include <hpx/include/performance_counters.hpp> +#endif #endif #include "../include/detail/config.hpp" @@ -96,6 +99,17 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements); } #endif + template <typename T, typename Host_Allocator> + static void register_allocator_counters_with_hpx(void) { +#ifdef CPPUDDLE_HAVE_COUNTERS + buffer_manager<T, Host_Allocator>::register_counters_with_hpx(); +#else + std::cerr << "Warning: Trying to register allocator performance counters with HPX but CPPuddle was built " + "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" + << std::endl; +#endif + } + /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { std::lock_guard<mutex_t> guard(instance().callback_protection_mut); @@ -121,6 +135,20 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL } } + static void print_performance_counters() { +#ifdef CPPUDDLE_HAVE_COUNTERS + std::lock_guard<mutex_t> guard(instance().callback_protection_mut); + for (const auto &print_function : + instance().print_callbacks) { + print_function(); + } +#else + std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built " + "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" + << std::endl; +#endif + } + // Member variables and methods private: @@ -129,6 +157,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL static buffer_recycler singleton{}; return singleton; } + /// Callbacks for printing the performance counter data + std::list<std::function<void()>> print_callbacks; /// Callbacks for buffer_manager finalize - each callback completely destroys /// one buffer_manager std::list<std::function<void()>> finalize_callbacks; @@ -160,6 +190,12 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL std::lock_guard<mutex_t> guard(instance().callback_protection_mut); instance().finalize_callbacks.push_back(func); } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_print_callback(const std::function<void()> &func) { + std::lock_guard<mutex_t> guard(instance().callback_protection_mut); + instance().print_callbacks.push_back(func); + } public: ~buffer_recycler() = default; @@ -174,6 +210,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // well using buffer_entry_type = std::tuple<T *, size_t, size_t, bool>; + public: /// Cleanup and delete this singleton static void clean() { @@ -183,6 +220,13 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL instance()[i].clean_all_buffers(); } } + static void print_performance_counters() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances; i++) { + std::lock_guard<mutex_t> guard(instance()[i].mut); + instance()[i].print_counters(); + } + } static void finalize() { assert(instance() && !is_finalized); is_finalized = true; @@ -207,6 +251,71 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL instance()[i].unused_buffer_list.clear(); } } +#if 
defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX) + static size_t get_sum_number_recycling(bool reset) { + if (reset) + sum_number_recycling = 0; + return sum_number_recycling; + } + static size_t get_sum_number_allocation(bool reset) { + if (reset) + sum_number_allocation = 0; + return sum_number_allocation; + } + static size_t get_sum_number_creation(bool reset) { + if (reset) + sum_number_creation = 0; + return sum_number_creation; + } + static size_t get_sum_number_deallocation(bool reset) { + if (reset) + sum_number_deallocation = 0; + return sum_number_deallocation; + } + static size_t get_sum_number_wrong_hints(bool reset) { + if (reset) + sum_number_wrong_hints = 0; + return sum_number_wrong_hints; + } + static size_t get_sum_number_bad_allocs(bool reset) { + if (reset) + sum_number_bad_allocs = 0; + return sum_number_bad_allocs; + } + + static void register_counters_with_hpx(void) { + std::string alloc_name = + boost::core::demangle(typeid(Host_Allocator).name()) + + std::string("_") + boost::core::demangle(typeid(T).name()); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"), + &get_sum_number_recycling, + "Number of allocations using a recycled buffer with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"), + &get_sum_number_allocation, + "Number of allocations with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"), + &get_sum_number_creation, + "Number of allocations not using a recycled buffer with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"), + &get_sum_number_deallocation, + "Number of deallocations yielding buffers to be recycled with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"), + &get_sum_number_wrong_hints, + "Number of wrong hints supplied to the dealloc method with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"), + &get_sum_number_bad_allocs, + "Number of wrong bad allocs which triggered a cleanup of unused buffers"); + } +#endif /// Tries to recycle or create a buffer of type T and size number_elements. 
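Once installed, these counter types behave like any other HPX performance counter. A sketch of how one of them might be read from application code; the instance path below is an assumption (it follows HPX's usual /object{instance}/counter naming scheme, and the middle segment is whatever boost::core::demangle produced for the allocator/type pair in use):

#include <hpx/include/performance_counters.hpp>
#include <cstdint>
#include <iostream>

void print_cppuddle_recycling_counter() {
  // Assumed counter instance name; adjust the demangled part to the real types.
  hpx::performance_counters::performance_counter counter(
      "/cppuddle{locality#0/total}/allocators/"
      "std::allocator<double>_double/number_recycling/");
  std::cout << "recycled allocations: "
            << counter.get_value<std::int64_t>().get() << std::endl;
}

The same names should also be usable with HPX's command-line counter printing (e.g. --hpx:print-counter).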
static T *get(size_t number_of_elements, bool manage_content_lifetime, @@ -229,6 +338,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_allocation++; + sum_number_allocation++; #endif // Check for unused buffers we can recycle: for (auto iter = instance()[location_id].unused_buffer_list.begin(); @@ -250,6 +360,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_recycling++; + sum_number_recycling++; #endif return std::get<0>(tuple); } @@ -266,6 +377,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_creation++; + sum_number_creation++; #endif if (manage_content_lifetime) { std::uninitialized_value_construct_n(buffer, number_of_elements); @@ -290,7 +402,9 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_creation++; + sum_number_creation++; instance()[location_id].number_bad_alloc++; + sum_number_bad_allocs++; #endif std::cerr << "Second attempt allocation successful!" << std::endl; if (manage_content_lifetime) { @@ -316,7 +430,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_dealloacation++; + instance()[location_id].number_deallocation++; + sum_number_deallocation++; #endif auto it = instance()[location_id].buffer_map.find(memory_location); assert(it != instance()[location_id].buffer_map.end()); @@ -332,6 +447,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // managers #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_wrong_hints++; + sum_number_wrong_hints++; #endif } @@ -345,7 +461,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_dealloacation++; + instance()[location_id].number_deallocation++; + sum_number_deallocation++; #endif auto it = instance()[location_id].buffer_map.find(memory_location); assert(it != instance()[location_id].buffer_map.end()); @@ -382,8 +499,13 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL mutex_t mut; #ifdef CPPUDDLE_HAVE_COUNTERS /// Performance counters - size_t number_allocation{0}, number_dealloacation{0}, number_wrong_hints{0}; + size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}; size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0}; + + static inline std::atomic<size_t> sum_number_allocation{0}, + sum_number_deallocation{0}, sum_number_wrong_hints{0}; + static inline std::atomic<size_t> sum_number_recycling{0}, + sum_number_creation{0}, sum_number_bad_allocs{0}; #endif /// default, private constructor - not automatically constructed due to the /// deleted constructors @@ -412,35 +534,16 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL clean_unused_buffers_only); 
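Putting the counter machinery together, a minimal usage sketch in the spirit of the test changes further down (assumptions: an HPX build configured with CPPUDDLE_WITH_COUNTERS=ON, recycle_std as the std::allocator-backed recycling alias, and the function being called from hpx_main so the HPX runtime is already up when the counters are installed):

#include <vector>
// #include <buffer_manager.hpp> // provides the recycler:: pieces used below

void counter_demo() {
  // Expose the per-allocator totals as HPX performance counters
  // (prints a warning and is ignored when counters are disabled).
  recycler::detail::buffer_recycler::register_allocator_counters_with_hpx<
      double, std::allocator<double>>();
  for (int pass = 0; pass < 2; pass++) {
    // First pass allocates fresh buffers, second pass should hit the recycling path.
    std::vector<double, recycler::recycle_std<double>> buf(1000, 1.0);
  }
  recycler::print_performance_counters(); // per-manager dump added by this patch
}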
buffer_recycler::add_finalize_callback( finalize); +#ifdef CPPUDDLE_HAVE_COUNTERS + buffer_recycler::add_print_callback( + print_performance_counters); +#endif }); } static inline std::atomic<bool> is_finalized; - - void clean_all_buffers(void) { -#ifdef CPPUDDLE_HAVE_COUNTERS - if (number_allocation == 0 && number_recycling == 0 && - number_bad_alloc == 0 && number_creation == 0 && - unused_buffer_list.empty() && buffer_map.empty()) { - return; - } -#endif - for (auto &buffer_tuple : unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - for (auto &map_tuple : buffer_map) { - auto buffer_tuple = map_tuple.second; - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } #ifdef CPPUDDLE_HAVE_COUNTERS + void print_counters(void) { // Print performance counters size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); std::cout << "\nBuffer manager destructor for (Alloc: " @@ -475,7 +578,32 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL << static_cast<float>(number_recycling) / number_allocation * 100.0f << "%" << std::endl; + } +#endif + + void clean_all_buffers(void) { +#ifdef CPPUDDLE_HAVE_COUNTERS + if (number_allocation == 0 && number_recycling == 0 && + number_bad_alloc == 0 && number_creation == 0 && + unused_buffer_list.empty() && buffer_map.empty()) { + return; + } #endif + for (auto &buffer_tuple : unused_buffer_list) { + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + for (auto &map_tuple : buffer_map) { + auto buffer_tuple = map_tuple.second; + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } unused_buffer_list.clear(); buffer_map.clear(); #ifdef CPPUDDLE_HAVE_COUNTERS @@ -665,6 +793,7 @@ template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> using aggressive_recycle_std = detail::aggressive_recycle_allocator<T, std::allocator<T>>; +inline void print_performance_counters() { detail::buffer_recycler::print_performance_counters(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself inline void force_cleanup() { detail::buffer_recycler::clean_all(); } diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 882a2c0c..9178dfbf 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -92,6 +92,7 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison @@ -113,6 +114,7 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers for better // 
comparison @@ -144,6 +146,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Recycler was faster than default allocator!" << std::endl; } + recycler::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 4d11cc16..6ddb70ad 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -126,6 +126,7 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison @@ -166,6 +167,7 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index f2256fdd..7708fe56 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -143,6 +143,7 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); + recycler::print_performance_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 6109f71d..055fa397 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -91,6 +91,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } + recycler::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index e86f00ce..0d8868db 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -88,6 +88,7 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison @@ -107,6 +108,7 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison @@ -136,6 +138,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Recycler was faster than default allocator!" 
<< std::endl; } + recycler::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index e4050e3f..28a37806 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -863,6 +863,7 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); + recycler::print_performance_counters(); recycler::force_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } From ffdbb7bd50a1ea23be3b5201a7969395cbc4674d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 15 Aug 2023 14:36:50 -0500 Subject: [PATCH 23/42] change_buckets_for_multigpu --- include/buffer_manager.hpp | 193 +++++++++++++++++++++++++-------- include/detail/config.hpp | 2 +- include/kokkos_buffer_util.hpp | 22 +++- 3 files changed, 165 insertions(+), 52 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index f17cc725..9a37a626 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -74,13 +74,16 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL template <typename T, typename Host_Allocator> static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional<size_t> location_hint = std::nullopt) { + std::optional<size_t> location_hint = std::nullopt, + std::optional<size_t> device_id = std::nullopt) { + return Host_Allocator{}.allocate(number_elements); } /// Marks an buffer as unused and fit for reusage template <typename T, typename Host_Allocator> static void mark_unused(T *p, size_t number_elements, - std::optional<size_t> location_hint = std::nullopt) { + std::optional<size_t> location_hint = std::nullopt, + std::optional<size_t> device_id = std::nullopt) { return Host_Allocator{}.deallocate(p, number_elements); } #else @@ -88,15 +91,18 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL /// buffer template <typename T, typename Host_Allocator> static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional<size_t> location_hint = std::nullopt) { - return buffer_manager<T, Host_Allocator>::get(number_elements, - manage_content_lifetime, location_hint); + std::optional<size_t> location_hint = std::nullopt, + std::optional<size_t> device_id = std::nullopt) { + return buffer_manager<T, Host_Allocator>::get( + number_elements, manage_content_lifetime, location_hint, device_id); } /// Marks an buffer as unused and fit for reusage template <typename T, typename Host_Allocator> static void mark_unused(T *p, size_t number_elements, - std::optional<size_t> location_hint = std::nullopt) { - return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements); + std::optional<size_t> location_hint = std::nullopt, + std::optional<size_t> device_id = std::nullopt) { + return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements, + location_hint, device_id); } #endif template <typename T, typename Host_Allocator> @@ -104,11 +110,12 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL #ifdef CPPUDDLE_HAVE_COUNTERS buffer_manager<T, Host_Allocator>::register_counters_with_hpx(); #else - std::cerr << "Warning: Trying to register allocator performance counters with HPX but CPPuddle was built " + std::cerr << "Warning: Trying to register allocator performance counters " + "with HPX but CPPuddle was built " "without 
CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" << std::endl; #endif - } + } /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { @@ -215,14 +222,14 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL /// Cleanup and delete this singleton static void clean() { assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances; i++) { + for (auto i = 0; i < number_instances * max_number_gpus; i++) { std::lock_guard<mutex_t> guard(instance()[i].mut); instance()[i].clean_all_buffers(); } } static void print_performance_counters() { assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances; i++) { + for (auto i = 0; i < number_instances * max_number_gpus; i++) { std::lock_guard<mutex_t> guard(instance()[i].mut); instance()[i].print_counters(); } @@ -230,7 +237,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL static void finalize() { assert(instance() && !is_finalized); is_finalized = true; - for (auto i = 0; i < number_instances; i++) { + for (auto i = 0; i < number_instances * max_number_gpus; i++) { std::lock_guard<mutex_t> guard(instance()[i].mut); instance()[i].clean_all_buffers(); } @@ -239,7 +246,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL /// Cleanup all buffers not currently in use static void clean_unused_buffers_only() { assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances; i++) { + for (auto i = 0; i < number_instances * max_number_gpus; i++) { std::lock_guard<mutex_t> guard(instance()[i].mut); for (auto &buffer_tuple : instance()[i].unused_buffer_list) { Host_Allocator alloc; @@ -319,7 +326,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL /// Tries to recycle or create a buffer of type T and size number_elements. static T *get(size_t number_of_elements, bool manage_content_lifetime, - std::optional<size_t> location_hint = std::nullopt) { + std::optional<size_t> location_hint = std::nullopt, + std::optional<size_t> gpu_device_id = std::nullopt) { init_callbacks_once(); if (is_finalized) { throw std::runtime_error("Tried allocation after finalization"); @@ -328,11 +336,22 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL size_t location_id = 0; if (location_hint) { - location_id = location_hint.value(); + location_id = *location_hint; } if (location_id >= number_instances) { throw std::runtime_error("Tried to create buffer with invalid location_id [get]"); } + size_t device_id = 0; + if (gpu_device_id) { + device_id = *gpu_device_id; + } + if (device_id >= max_number_gpus) { + throw std::runtime_error("Tried to create buffer with invalid device id [get]! 
" + "Is multigpu support enabled with the correct number " + "of GPUs?"); + } + + location_id = location_id + device_id * number_instances; std::lock_guard<mutex_t> guard(instance()[location_id].mut); @@ -369,7 +388,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // No unused buffer found -> Create new one and return it try { recycler::device_selection::select_device_functor<T, Host_Allocator>{}( - location_id / instances_per_gpu); + device_id); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( @@ -395,7 +414,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL // We've done all we can in here Host_Allocator alloc; recycler::device_selection::select_device_functor<T, Host_Allocator>{}( - location_id / instances_per_gpu); + device_id); T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, @@ -415,17 +434,32 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL } static void mark_unused(T *memory_location, size_t number_of_elements, - std::optional<size_t> location_hint = std::nullopt) { + std::optional<size_t> location_hint = std::nullopt, + std::optional<size_t> device_hint = std::nullopt) { if (is_finalized) return; assert(instance() && !is_finalized); + size_t location_id = 0; if (location_hint) { - size_t location_id = location_hint.value(); + location_id = *location_hint; if (location_id >= number_instances) { throw std::runtime_error( "Buffer recylcer received invalid location hint [mark_unused]"); } + } + size_t device_id = 0; + if (device_hint) { + device_id = *device_hint; + if (device_id >= max_number_gpus) { + throw std::runtime_error( + "Buffer recylcer received invalid devce hint [mark_unused]"); + } + } + + // Attempt 1 to find the correct bucket/location: Look at provided hint: + if (location_hint) { + size_t location_id = location_hint.value() + device_id * number_instances; std::lock_guard<mutex_t> guard(instance()[location_id].mut); if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { @@ -443,19 +477,20 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL instance()[location_id].buffer_map.erase(memory_location); return; // Success } - // hint was wrong - note that, and continue on with all other buffer - // managers + // hint was wrong #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_wrong_hints++; sum_number_wrong_hints++; #endif } - - for(size_t location_id = 0; location_id < number_instances; location_id++) { + // Failed to find buffer in the specified localtion/device! + // Attempt 2 - Look for buffer other locations on the same device... + for (size_t location_id = device_id * number_instances; + location_id < (device_id + 1) * number_instances; location_id++) { if (location_hint) { - if (location_hint.value() == location_id) { - continue; // already tried this -> skip - } + if (*location_hint + device_id * max_number_gpus == location_id) { + continue; // already tried this -> skip + } } std::lock_guard<mutex_t> guard(instance()[location_id].mut); if (instance()[location_id].buffer_map.find(memory_location) != @@ -475,6 +510,64 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL return; // Success } } + // Failed to find buffer on the specified device! 
+ // Attempt 3 - Look for buffer on other devices... + for (size_t local_device_id = 0; local_device_id < max_number_gpus; + local_device_id++) { + if (local_device_id == device_id) + continue; // aldready tried this device + + // Try hint localtion first yet again (though on different device) + if (location_hint) { + size_t location_id = location_hint.value() + local_device_id * number_instances; + std::lock_guard<mutex_t> guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + // Failed - check all other localtions on device + for (size_t location_id = local_device_id * number_instances; + location_id < (local_device_id + 1) * number_instances; location_id++) { + if (location_hint) { + if (*location_hint + local_device_id * max_number_gpus == location_id) { + continue; // already tried this -> skip + } + } + std::lock_guard<mutex_t> guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + } + // Buffer that is to be deleted is nowhere to be found - we looked everywhere! + // => + // Failure! Handle here... // TODO Throw exception instead in the futures, as soon as the recycler finalize is // in all user codes @@ -488,7 +581,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL << "Warning! Tried to delete non-existing buffer within CPPuddle!" << std::endl; std::cerr << "Did you forget to call recycler::finalize?" 
<< std::endl; - } + } private: /// List with all buffers still in usage @@ -516,7 +609,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL operator=(buffer_manager<T, Host_Allocator> &&other) = delete; static std::unique_ptr<buffer_manager[]>& instance(void) { static std::unique_ptr<buffer_manager[]> instances{ - new buffer_manager[number_instances]}; + new buffer_manager[number_instances * max_number_gpus]}; return instances; } static void init_callbacks_once(void) { @@ -544,6 +637,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL #ifdef CPPUDDLE_HAVE_COUNTERS void print_counters(void) { + if (number_allocation == 0) + return; // Print performance counters size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); std::cout << "\nBuffer manager destructor for (Alloc: " @@ -642,15 +737,16 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { using underlying_allocator_type = Host_Allocator; static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>); const std::optional<size_t> dealloc_hint; + const std::optional<size_t> device_id; #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS recycle_allocator() noexcept - : dealloc_hint(std::nullopt) {} + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} explicit recycle_allocator(size_t hint) noexcept - : dealloc_hint(std::nullopt) {} + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} explicit recycle_allocator( recycle_allocator<T, Host_Allocator> const &other) noexcept - : dealloc_hint(std::nullopt) {} + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get<T, Host_Allocator>(n); return data; @@ -660,19 +756,20 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { } #else recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num()) {} - explicit recycle_allocator(size_t hint) noexcept - : dealloc_hint(hint) {} + : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {} + explicit recycle_allocator(const size_t device_id) noexcept + : dealloc_hint(hint), device_id(device_id) {} explicit recycle_allocator( recycle_allocator<T, Host_Allocator> const &other) noexcept - : dealloc_hint(other.dealloc_hint) {} + : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get<T, Host_Allocator>( - n, false, hpx::get_worker_thread_num()); + n, false, hpx::get_worker_thread_num(), device_id); return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint); + buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint, + device_id); } #endif @@ -707,16 +804,17 @@ struct aggressive_recycle_allocator { using value_type = T; using underlying_allocator_type = Host_Allocator; static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>); - std::optional<size_t> dealloc_hint; + const std::optional<size_t> dealloc_hint; + const std::optional<size_t> device_id; #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS aggressive_recycle_allocator() noexcept - : dealloc_hint(std::nullopt) {} + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} explicit aggressive_recycle_allocator(size_t hint) noexcept - : dealloc_hint(std::nullopt) {} + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} explicit aggressive_recycle_allocator( aggressive_recycle_allocator<T, Host_Allocator> 
const &) noexcept - : dealloc_hint(std::nullopt) {} + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get<T, Host_Allocator>( n, true); // also initializes the buffer if it isn't reused @@ -727,20 +825,21 @@ struct aggressive_recycle_allocator { } #else aggressive_recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num()) {} - explicit aggressive_recycle_allocator(size_t hint) noexcept - : dealloc_hint(hint) {} + : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {} + explicit aggressive_recycle_allocator(const size_t device_id) noexcept + : device_id(device_id) {} explicit aggressive_recycle_allocator( recycle_allocator<T, Host_Allocator> const &other) noexcept - : dealloc_hint(other.dealloc_hint) {} + : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get<T, Host_Allocator>( - n, true, hpx::get_worker_thread_num()); // also initializes the buffer + n, true, dealloc_hint, device_id); // also initializes the buffer // if it isn't reused return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint); + buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint, + device_id); } #endif diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 0244ae9b..56ac7b8d 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -51,7 +51,7 @@ static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu"); static_assert(number_instances >= max_number_gpus); static_assert(max_number_gpus > 0); static_assert(number_instances > 0); -constexpr size_t instances_per_gpu = number_instances / max_number_gpus; +//constexpr size_t instances_per_gpu = number_instances / max_number_gpus; /// Uses HPX thread information to determine which GPU should be used inline size_t get_device_id(void) { diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 52413cce..f6cf04fa 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -101,15 +101,29 @@ class recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( allocator, total_elements)) {} + // TODO Add version with only a device parameter -- should use get but come with a different + // view deleter that just uses mark unused + + + // TODO NExt up: Add similar mechanism to aggregatation manager + + + // TODO Add similar mechanism to cuda_device_buffer + + + // TODO Switch Octo-Tiger hydro kokkos solver to this (should mostly just + // require + + // TODO These are meant to get the static data (predicatable location_id really required?) template <typename... Args, std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true> - recycled_view(std::size_t location_id, Args... args) + recycled_view(std::size_t device_id, std::size_t location_id, Args... args) : kokkos_type( detail::buffer_recycler::get< element_type, typename alloc_type::underlying_allocator_type>( kokkos_type::required_allocation_size(args...) / sizeof(element_type), - false, location_id), + false, location_id, device_id), args...), total_elements(kokkos_type::required_allocation_size(args...) 
/ sizeof(element_type)), @@ -119,13 +133,13 @@ class recycled_view : public kokkos_type { template < typename layout_t, std::enable_if_t<Kokkos::is_array_layout<layout_t>::value, bool> = true> - recycled_view(std::size_t location_id, layout_t layout) + recycled_view(std::size_t device_id, std::size_t location_id, layout_t layout) : kokkos_type( detail::buffer_recycler::get< element_type, typename alloc_type::underlying_allocator_type>( kokkos_type::required_allocation_size(layout) / sizeof(element_type), - false, location_id), + false, location_id, device_id), layout), total_elements(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), From 7880d4b9c71eebeb60ea7840479daf49c90cac87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Fri, 18 Aug 2023 12:52:13 -0500 Subject: [PATCH 24/42] Remove location_id from user interface --- include/aggregation_manager.hpp | 17 ++++++++----- include/buffer_manager.hpp | 37 ++++++++++++++++++++------- include/cuda_buffer_util.hpp | 41 +++++++++--------------------- include/detail/config.hpp | 3 ++- include/kokkos_buffer_util.hpp | 45 ++++++++++----------------------- 5 files changed, 65 insertions(+), 78 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index e687e106..8961226e 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -532,9 +532,9 @@ template <typename Executor> class Aggregated_Executor { aggregation_mutex_t mut; /// Data entry for a buffer allocation: void* pointer, size_t for - /// buffer-size, atomic for the slice counter + /// buffer-size, atomic for the slice counter, location_id, gpu_id using buffer_entry_t = - std::tuple<void*, const size_t, std::atomic<size_t>, bool, const size_t>; + std::tuple<void*, const size_t, std::atomic<size_t>, bool, const size_t, size_t>; /// Keeps track of the aggregated buffer allocations done in all the slices std::deque<buffer_entry_t> buffer_allocations; /// Map pointer to deque index for fast access in the deallocations @@ -560,14 +560,16 @@ template <typename Executor> class Aggregated_Executor { // Default location -- useful for GPU builds as we otherwise create way too // many different buffers for different aggregation sizes on different GPUs - size_t location_id = gpu_id * instances_per_gpu; + /* size_t location_id = gpu_id * instances_per_gpu; */ + // Use integer conversion to only use 0 16 32 ... as buckets + size_t location_id = (hpx::get_worker_thread_num() / 16) * 16; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location // Usually handy for CPU builds where we want to use the buffers // close to the current CPU core /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ - location_id = (gpu_id) * instances_per_gpu; + /* location_id = (gpu_id) * instances_per_gpu; */ // division makes sure that we always use the same instance to store our gpu buffers. } #endif @@ -576,10 +578,10 @@ template <typename Executor> class Aggregated_Executor { // buffer_recycler... 
T *aggregated_buffer = recycler::detail::buffer_recycler::get<T, Host_Allocator>( - size, manage_content_lifetime, location_id); + size, manage_content_lifetime, location_id, gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast<void *>(aggregated_buffer), - size, 1, true, location_id); + size, 1, true, location_id, gpu_id); #ifndef NDEBUG // if previousely used the buffer should not be in usage anymore @@ -633,6 +635,7 @@ template <typename Executor> class Aggregated_Executor { auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); + const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); assert(valid); T *buffer_pointer = static_cast<T *>(buffer_pointer_void); @@ -650,7 +653,7 @@ template <typename Executor> class Aggregated_Executor { if (valid) { assert(buffers_in_use == true); recycler::detail::buffer_recycler::mark_unused<T, Host_Allocator>( - buffer_pointer, buffer_size, location_id); + buffer_pointer, buffer_size, location_id, gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused valid = false; diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 9a37a626..ab4b5144 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -284,6 +284,11 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL sum_number_wrong_hints = 0; return sum_number_wrong_hints; } + static size_t get_sum_number_wrong_device_hints(bool reset) { + if (reset) + sum_number_wrong_hints = 0; + return sum_number_wrong_device_hints; + } static size_t get_sum_number_bad_allocs(bool reset) { if (reset) sum_number_bad_allocs = 0; @@ -317,6 +322,10 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"), &get_sum_number_wrong_hints, "Number of wrong hints supplied to the dealloc method with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"), + &get_sum_number_wrong_device_hints, + "Number of wrong device hints supplied to the dealloc method with this allocator"); hpx::performance_counters::install_counter_type( std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"), &get_sum_number_bad_allocs, @@ -510,6 +519,12 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL return; // Success } } + // device hint was wrong +#ifdef CPPUDDLE_HAVE_COUNTERS + if (device_hint) { + sum_number_wrong_device_hints++; + } +#endif // Failed to find buffer on the specified device! // Attempt 3 - Look for buffer on other devices... 
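      // Worked example for the counter added above (illustrative values): a
      // buffer created with location_hint 2 on device 0 but released with
      // device_hint 1 is missed by attempts 1 and 2, so
      // sum_number_wrong_device_hints is incremented and the cross-device
      // sweep below has to find it again in bucket 2 + 0 * number_instances
      // before it can be moved to the unused list.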
for (size_t local_device_id = 0; local_device_id < max_number_gpus; @@ -592,17 +607,17 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL mutex_t mut; #ifdef CPPUDDLE_HAVE_COUNTERS /// Performance counters - size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}; - size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0}; + size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}, + number_recycling{0}, number_creation{0}, number_bad_alloc{0}; static inline std::atomic<size_t> sum_number_allocation{0}, - sum_number_deallocation{0}, sum_number_wrong_hints{0}; - static inline std::atomic<size_t> sum_number_recycling{0}, + sum_number_deallocation{0}, sum_number_wrong_hints{0}, + sum_number_wrong_device_hints{0}, sum_number_recycling{0}, sum_number_creation{0}, sum_number_bad_allocs{0}; #endif - /// default, private constructor - not automatically constructed due to the - /// deleted constructors - buffer_manager() = default; + /// default, private constructor - not automatically constructed due to + /// the deleted constructors + buffer_manager() = default; buffer_manager& operator=(buffer_manager<T, Host_Allocator> const &other) = default; buffer_manager& @@ -758,7 +773,9 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { recycle_allocator() noexcept : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {} explicit recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hint), device_id(device_id) {} + : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {} + explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept + : dealloc_hint(location_id), device_id(device_id) {} explicit recycle_allocator( recycle_allocator<T, Host_Allocator> const &other) noexcept : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} @@ -827,7 +844,9 @@ struct aggressive_recycle_allocator { aggressive_recycle_allocator() noexcept : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {} explicit aggressive_recycle_allocator(const size_t device_id) noexcept - : device_id(device_id) {} + : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {} + explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept + : dealloc_hint(location_id), device_id(device_id) {} explicit aggressive_recycle_allocator( recycle_allocator<T, Host_Allocator> const &other) noexcept : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 8fb57d4f..e454d8c0 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -111,24 +111,18 @@ using recycle_allocator_cuda_device = template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> struct cuda_device_buffer { - size_t gpu_id{0}; + recycle_allocator_cuda_device<T> allocator; T *device_side_buffer; size_t number_of_elements; - explicit cuda_device_buffer(size_t number_of_elements) - : number_of_elements(number_of_elements) { - device_side_buffer = - recycle_allocator_cuda_device<T>{}.allocate(number_of_elements); - } - // TODO deprecate and remove gpu_id - explicit cuda_device_buffer(size_t number_of_elements, size_t gpu_id) - : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) { - assert(gpu_id == 0); + + cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) + : allocator{device_id}, 
number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); device_side_buffer = - recycle_allocator_cuda_device<T>{}.allocate(number_of_elements); + allocator.allocate(number_of_elements); } ~cuda_device_buffer() { - recycle_allocator_cuda_device<T>{}.deallocate(device_side_buffer, - number_of_elements); + allocator.deallocate(device_side_buffer, number_of_elements); } // not yet implemented cuda_device_buffer(cuda_device_buffer const &other) = delete; @@ -136,30 +130,19 @@ struct cuda_device_buffer { cuda_device_buffer(cuda_device_buffer const &&other) = delete; cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; -private: - bool set_id{false}; }; template <typename T, typename Host_Allocator, std::enable_if_t<std::is_trivial<T>::value, int> = 0> struct cuda_aggregated_device_buffer { - size_t gpu_id{0}; T *device_side_buffer; size_t number_of_elements; - explicit cuda_aggregated_device_buffer(size_t number_of_elements) - : number_of_elements(number_of_elements) { - device_side_buffer = - recycle_allocator_cuda_device<T>{}.allocate(number_of_elements); - } - // TODO deprecate and remove gpu_id - explicit cuda_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc) - : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) { - assert(gpu_id == 0); + explicit cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { device_side_buffer = alloc.allocate(number_of_elements); } ~cuda_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, - number_of_elements); + alloc.deallocate(device_side_buffer, number_of_elements); } // not yet implemented cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; @@ -168,8 +151,8 @@ struct cuda_aggregated_device_buffer { cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; private: - bool set_id{false}; - Host_Allocator &alloc; + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer }; namespace device_selection { diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 56ac7b8d..ea6e9a26 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -56,7 +56,8 @@ static_assert(number_instances > 0); /// Uses HPX thread information to determine which GPU should be used inline size_t get_device_id(void) { #if defined(CPPUDDLE_HAVE_HPX) - return hpx::get_worker_thread_num() / instances_per_gpu; + //return hpx::get_worker_thread_num() / max_num_gpus; + return 0; #else return 0; #endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index f6cf04fa..2945b422 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -16,7 +16,7 @@ template<typename element_type, typename alloc_type> struct view_deleter { alloc_type allocator; size_t total_elements; - view_deleter(alloc_type &alloc, size_t total_elements) : allocator(alloc), + view_deleter(alloc_type alloc, size_t total_elements) : allocator(alloc), total_elements(total_elements) {} void operator()(element_type* p) { allocator.deallocate(p, total_elements); @@ -29,6 +29,7 @@ class aggregated_recycled_view : public kokkos_type { alloc_type allocator; size_t total_elements{0}; std::shared_ptr<element_type> data_ref_counter; + static_assert(std::is_same_v<element_type, typename 
alloc_type::value_type>); public: using view_type = kokkos_type; @@ -79,10 +80,10 @@ class aggregated_recycled_view : public kokkos_type { ~aggregated_recycled_view() {} }; + template <typename kokkos_type, typename alloc_type, typename element_type> class recycled_view : public kokkos_type { private: - alloc_type allocator; size_t total_elements{0}; std::shared_ptr<element_type> data_ref_counter; @@ -93,58 +94,38 @@ class recycled_view : public kokkos_type { std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true> recycled_view(Args... args) : kokkos_type( - allocator.allocate(kokkos_type::required_allocation_size(args...) / + alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), args...), total_elements(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( - allocator, total_elements)) {} - - // TODO Add version with only a device parameter -- should use get but come with a different - // view deleter that just uses mark unused - + alloc_type{}, total_elements)) {} - // TODO NExt up: Add similar mechanism to aggregatation manager - - - // TODO Add similar mechanism to cuda_device_buffer - - - // TODO Switch Octo-Tiger hydro kokkos solver to this (should mostly just - // require - - // TODO These are meant to get the static data (predicatable location_id really required?) template <typename... Args, std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true> - recycled_view(std::size_t device_id, std::size_t location_id, Args... args) + recycled_view(const size_t device_id, Args... args) : kokkos_type( - detail::buffer_recycler::get< - element_type, typename alloc_type::underlying_allocator_type>( - kokkos_type::required_allocation_size(args...) / - sizeof(element_type), - false, location_id, device_id), + alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), args...), total_elements(kokkos_type::required_allocation_size(args...) 
/ sizeof(element_type)), data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( - allocator, total_elements)) {} + alloc_type{device_id}, total_elements)) {} template < typename layout_t, std::enable_if_t<Kokkos::is_array_layout<layout_t>::value, bool> = true> - recycled_view(std::size_t device_id, std::size_t location_id, layout_t layout) + recycled_view(std::size_t device_id, layout_t layout) : kokkos_type( - detail::buffer_recycler::get< - element_type, typename alloc_type::underlying_allocator_type>( - kokkos_type::required_allocation_size(layout) / - sizeof(element_type), - false, location_id, device_id), + alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)), layout), total_elements(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), data_ref_counter(this->data(), view_deleter<element_type, alloc_type>( - allocator, total_elements)) {} + alloc_type{device_id}, total_elements)) {} recycled_view( const recycled_view<kokkos_type, alloc_type, element_type> &other) From f3d71c83d3e0cb82d97f3ddcdce1e07c5d3dea1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Fri, 18 Aug 2023 13:36:59 -0500 Subject: [PATCH 25/42] WIP, add gpu_id to stream managers --- include/aggregation_manager.hpp | 24 +++++++++++++++++++++--- include/detail/config.hpp | 6 +++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 8961226e..4d91b014 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -6,6 +6,7 @@ #ifndef WORK_AGGREGATION_MANAGER #define WORK_AGGREGATION_MANAGER +#include <stdexcept> #define DEBUG_AGGREGATION_CALLS 1 #include <stdio.h> @@ -1014,8 +1015,23 @@ class aggregation_pool { /// interface template <typename... 
Ts> static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode) { - for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) { + Aggregated_Executor_Modes mode, std::optional<size_t> overwrite_number_devices) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Kernel: " + std::string(kernelname)); + } + if (number_devices) { + if (*overwrite_number_devices > max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Kernel: " + std::string(kernelname)); + } + number_devices = *overwrite_number_devices; + } + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); assert(instance()[gpu_id].aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { @@ -1029,7 +1045,7 @@ class aggregation_pool { /// Will always return a valid executor slice static decltype(auto) request_executor_slice(void) { - const size_t gpu_id = get_device_id(); + const size_t gpu_id = get_device_id(number_devices); /* const size_t gpu_id = 1; */ std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); assert(!instance()[gpu_id].aggregation_executor_pool.empty()); @@ -1090,6 +1106,8 @@ class aggregation_pool { new aggregation_pool[max_number_gpus]}; return pool_instances; } + static inline size_t number_devices = max_number_gpus; + static inline bool is_initialized = false; aggregation_pool() = default; public: diff --git a/include/detail/config.hpp b/include/detail/config.hpp index ea6e9a26..7dd8837f 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -54,10 +54,10 @@ static_assert(number_instances > 0); //constexpr size_t instances_per_gpu = number_instances / max_number_gpus; /// Uses HPX thread information to determine which GPU should be used -inline size_t get_device_id(void) { +inline size_t get_device_id(const size_t number_gpus) { #if defined(CPPUDDLE_HAVE_HPX) - //return hpx::get_worker_thread_num() / max_num_gpus; - return 0; + assert(number_gpus < max_number_gpus); + return hpx::get_worker_thread_num() / number_gpus; #else return 0; #endif From 1026cd5c42fef625e668a8f7c74ddda08cee0da7 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Sat, 19 Aug 2023 14:05:10 -0500 Subject: [PATCH 26/42] Fix multigpu --- include/aggregation_manager.hpp | 31 +++++++++++--------- include/buffer_manager.hpp | 10 +++---- include/detail/config.hpp | 5 ++-- include/stream_manager.hpp | 51 +++++++++++++++++---------------- 4 files changed, 52 insertions(+), 45 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 4d91b014..35338bfc 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -563,7 +563,7 @@ template <typename Executor> class Aggregated_Executor { // many different buffers for different aggregation sizes on different GPUs /* size_t location_id = gpu_id * instances_per_gpu; */ // Use integer conversion to only use 0 16 32 ... 
as buckets - size_t location_id = (hpx::get_worker_thread_num() / 16) * 16; + size_t location_id = ((hpx::get_worker_thread_num() % number_instances) / 16) * 16; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location @@ -1015,23 +1015,22 @@ class aggregation_pool { /// interface template <typename... Ts> static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode, std::optional<size_t> overwrite_number_devices) { + Aggregated_Executor_Modes mode, size_t num_devices = 1) { if (is_initialized) { throw std::runtime_error( std::string("Trying to initialize cppuddle aggregation pool twice") + - " Kernel: " + std::string(kernelname)); + " Agg pool name: " + std::string(kernelname)); } - if (number_devices) { - if (*overwrite_number_devices > max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Kernel: " + std::string(kernelname)); - } - number_devices = *overwrite_number_devices; + if (num_devices > max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); } + number_devices = num_devices; for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); assert(instance()[gpu_id].aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { @@ -1041,10 +1040,16 @@ class aggregation_pool { instance()[gpu_id].slices_per_executor = slices_per_executor; instance()[gpu_id].mode = mode; } + is_initialized = true; } /// Will always return a valid executor slice static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } const size_t gpu_id = get_device_id(number_devices); /* const size_t gpu_id = 1; */ std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); @@ -1106,7 +1111,7 @@ class aggregation_pool { new aggregation_pool[max_number_gpus]}; return pool_instances; } - static inline size_t number_devices = max_number_gpus; + static inline size_t number_devices = 1; static inline bool is_initialized = false; aggregation_pool() = default; diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ab4b5144..ea04160b 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -771,9 +771,9 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { } #else recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {} + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} explicit recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {} + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept : dealloc_hint(location_id), device_id(device_id) {} explicit recycle_allocator( @@ -781,7 +781,7 @@ template <typename T, typename Host_Allocator> struct recycle_allocator { : dealloc_hint(other.dealloc_hint), 
device_id(other.device_id) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get<T, Host_Allocator>( - n, false, hpx::get_worker_thread_num(), device_id); + n, false, hpx::get_worker_thread_num() % number_instances, device_id); return data; } void deallocate(T *p, std::size_t n) { @@ -842,9 +842,9 @@ struct aggressive_recycle_allocator { } #else aggressive_recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {} + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} explicit aggressive_recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {} + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept : dealloc_hint(location_id), device_id(device_id) {} explicit aggressive_recycle_allocator( diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 7dd8837f..9b425f8b 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -48,16 +48,15 @@ constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS; #ifndef CPPUDDLE_HAVE_HPX static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu"); #endif -static_assert(number_instances >= max_number_gpus); +//static_assert(number_instances >= max_number_gpus); static_assert(max_number_gpus > 0); -static_assert(number_instances > 0); //constexpr size_t instances_per_gpu = number_instances / max_number_gpus; /// Uses HPX thread information to determine which GPU should be used inline size_t get_device_id(const size_t number_gpus) { #if defined(CPPUDDLE_HAVE_HPX) assert(number_gpus < max_number_gpus); - return hpx::get_worker_thread_num() / number_gpus; + return hpx::get_worker_thread_num() % number_gpus; #else return 0; #endif diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index a10229a2..8be13b32 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -60,9 +60,10 @@ template <class Interface> class round_robin_pool { return *( std::min_element(std::begin(ref_counters), std::end(ref_counters))); } - size_t get_next_device_id() { - return 0; // single gpu pool - } + // TODO Remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ }; template <class Interface> class priority_pool { @@ -103,9 +104,10 @@ template <class Interface> class priority_pool { return ref_counters[priorities[0]] < load_limit; } size_t get_current_load() { return ref_counters[priorities[0]]; } - size_t get_next_device_id() { - return 0; // single gpu pool - } + // TODO remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ }; /// Access/Concurrency Control for stream pool implementation @@ -130,28 +132,28 @@ class stream_pool { stream_pool_implementation<Interface, Pool>::cleanup(); } template <class Interface, class Pool> - static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = get_device_id()) { + static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id) { return stream_pool_implementation<Interface, Pool>::get_interface(gpu_id); } template <class Interface, class Pool> - static void release_interface(size_t index, const size_t gpu_id = get_device_id()) noexcept { + static void release_interface(size_t index, const size_t gpu_id) noexcept { stream_pool_implementation<Interface, Pool>::release_interface(index, gpu_id); } 
template <class Interface, class Pool> - static bool interface_available(size_t load_limit) noexcept { + static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { return stream_pool_implementation<Interface, Pool>::interface_available( - load_limit, get_device_id()); + load_limit, gpu_id); } template <class Interface, class Pool> - static size_t get_current_load() noexcept { + static size_t get_current_load(const size_t gpu_id = 0) noexcept { return stream_pool_implementation<Interface, Pool>::get_current_load( - get_device_id()); + gpu_id); } - // TODO deprecated! Remove... template <class Interface, class Pool> - static size_t get_next_device_id() noexcept { - return stream_pool_implementation<Interface, Pool>::get_next_device_id(get_device_id()); + static size_t get_next_device_id(const size_t number_gpus) noexcept { + // TODO add round robin and min strategy + return get_device_id(number_gpus); } template <class Interface, class Pool> @@ -232,11 +234,11 @@ class stream_pool { return instance().streampools[gpu_id].get_current_load(); } // TODO deprecated! Remove... - static size_t get_next_device_id(const size_t gpu_id = 0) { - std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == max_number_gpus); - return instance().streampools[gpu_id].get_next_device_id(); - } + /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ + /* std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().streampools.size() == max_number_gpus); */ + /* return instance().streampools[gpu_id].get_next_device_id(); */ + /* } */ static void set_device_selector(std::function<void(size_t)> select_gpu_function) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); @@ -288,16 +290,16 @@ class stream_pool { template <class Interface, class Pool> class stream_interface { public: - stream_interface() - : t(stream_pool::get_interface<Interface, Pool>()), - interface(std::get<0>(t)), interface_index(std::get<1>(t)) {} + explicit stream_interface(size_t gpu_id) + : t(stream_pool::get_interface<Interface, Pool>(gpu_id)), + interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} stream_interface(const stream_interface &other) = delete; stream_interface &operator=(const stream_interface &other) = delete; stream_interface(stream_interface &&other) = delete; stream_interface &operator=(stream_interface &&other) = delete; ~stream_interface() { - stream_pool::release_interface<Interface, Pool>(interface_index); + stream_pool::release_interface<Interface, Pool>(interface_index, gpu_id); } template <typename F, typename... 
Ts> @@ -318,6 +320,7 @@ template <class Interface, class Pool> class stream_interface { private: std::tuple<Interface &, size_t> t; size_t interface_index; + size_t gpu_id; public: Interface &interface; From 95c77224f1ec08778fecd6d4456b1ae0cf96083c Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Mon, 21 Aug 2023 01:05:35 -0500 Subject: [PATCH 27/42] Adapt hip_device_buffers for multigpu --- include/cuda_buffer_util.hpp | 2 +- include/hip_buffer_util.hpp | 39 ++++++++++-------------------------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index e454d8c0..55d3397a 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -136,7 +136,7 @@ template <typename T, typename Host_Allocator, std::enable_if_t<std::is_trivial< struct cuda_aggregated_device_buffer { T *device_side_buffer; size_t number_of_elements; - explicit cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) : number_of_elements(number_of_elements), alloc(alloc) { device_side_buffer = alloc.allocate(number_of_elements); diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index c33566e5..7942792b 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -22,7 +22,6 @@ template <class T> struct hip_pinned_allocator { template <class U> explicit hip_pinned_allocator(hip_pinned_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { - hipSetDevice(get_device_id()); T *data; // hipError_t error = // hipMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T)); @@ -69,7 +68,6 @@ template <class T> struct hip_device_allocator { template <class U> explicit hip_device_allocator(hip_device_allocator<U> const &) noexcept {} T *allocate(std::size_t n) { - hipSetDevice(get_device_id()); T *data; hipError_t error = hipMalloc(&data, n * sizeof(T)); if (error != hipSuccess) { @@ -115,22 +113,18 @@ using recycle_allocator_hip_device = // TODO Is this even required? (cuda version should work fine...) 
template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0> struct hip_device_buffer { - size_t gpu_id{0}; + recycle_allocator_hip_device<T> allocator; T *device_side_buffer; size_t number_of_elements; - explicit hip_device_buffer(size_t number_of_elements) - : number_of_elements(number_of_elements) { - device_side_buffer = - recycle_allocator_hip_device<T>{}.allocate(number_of_elements); - } - explicit hip_device_buffer(size_t number_of_elements, size_t gpu_id) - : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) { + + hip_device_buffer(size_t number_of_elements, size_t device_id) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); device_side_buffer = recycle_allocator_hip_device<T>{}.allocate(number_of_elements); } ~hip_device_buffer() { - recycle_allocator_hip_device<T>{}.deallocate(device_side_buffer, - number_of_elements); + allocator.deallocate(device_side_buffer, number_of_elements); } // not yet implemented hip_device_buffer(hip_device_buffer const &other) = delete; @@ -138,30 +132,19 @@ struct hip_device_buffer { hip_device_buffer(hip_device_buffer const &&other) = delete; hip_device_buffer operator=(hip_device_buffer const &&other) = delete; -private: - bool set_id{false}; }; template <typename T, typename Host_Allocator, std::enable_if_t<std::is_trivial<T>::value, int> = 0> struct hip_aggregated_device_buffer { - size_t gpu_id{0}; T *device_side_buffer; size_t number_of_elements; - explicit hip_aggregated_device_buffer(size_t number_of_elements) - : number_of_elements(number_of_elements) { - device_side_buffer = - recycle_allocator_hip_device<T>{}.allocate(number_of_elements); - } - explicit hip_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc) - : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) { - assert(gpu_id == 0); + hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { device_side_buffer = alloc.allocate(number_of_elements); } ~hip_aggregated_device_buffer() { - assert(gpu_id == 0); - alloc.deallocate(device_side_buffer, - number_of_elements); + alloc.deallocate(device_side_buffer, number_of_elements); } // not yet implemented hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; @@ -170,8 +153,8 @@ struct hip_aggregated_device_buffer { hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; private: - bool set_id{false}; - Host_Allocator &alloc; + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer }; } // end namespace recycler From 07f3c4187b5057729848e0c370c4f72912601553 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Mon, 21 Aug 2023 11:37:22 -0500 Subject: [PATCH 28/42] Fix missing hip device selectors --- include/hip_buffer_util.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 7942792b..adc846d0 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -157,5 +157,16 @@ struct hip_aggregated_device_buffer { // for the entire lifetime of this buffer }; +namespace device_selection { +template <typename T> +struct select_device_functor<T, detail::hip_pinned_allocator<T>> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; 
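A minimal usage sketch for the reworked device buffer (the function name, element count and use of device id 1 are illustrative assumptions; it presumes a build configured with CPPUDDLE_WITH_NUMBER_GPUS of at least 2). The wrapper now owns a device-bound recycling allocator, so construction selects the GPU and destruction returns the allocation to that device's buckets:

#include <cstddef>
#include "hip_buffer_util.hpp"

void example_scratch_on_gpu1() {
  constexpr std::size_t n = 4096;
  // Allocates (or recycles) n doubles on device 1 via recycle_allocator_hip_device
  recycler::hip_device_buffer<double> scratch(n, /*device_id=*/1);
  // ... enqueue HIP kernels/copies on device 1 using scratch.device_side_buffer ...
} // destructor hands the buffer back to the device-1 buckets for reuse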
+template <typename T> +struct select_device_functor<T, detail::hip_device_allocator<T>> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +} // namespace device_selection + } // end namespace recycler #endif From 8b4d15b5b158ad26c3de2346c70687dac09db1bd Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Mon, 21 Aug 2023 11:37:37 -0500 Subject: [PATCH 29/42] Add exception warnings Should not impact non-exception performance due to zero cost abstractions in gcc/clang --- include/buffer_manager.hpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ea04160b..32e751d9 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -93,16 +93,28 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL static T *get(size_t number_elements, bool manage_content_lifetime = false, std::optional<size_t> location_hint = std::nullopt, std::optional<size_t> device_id = std::nullopt) { - return buffer_manager<T, Host_Allocator>::get( - number_elements, manage_content_lifetime, location_hint, device_id); + try { + return buffer_manager<T, Host_Allocator>::get( + number_elements, manage_content_lifetime, location_hint, device_id); + } catch (const std::exception &exc) { + std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl; + std::cerr << "Rethrowing exception... " << std::endl;; + throw; + } } /// Marks an buffer as unused and fit for reusage template <typename T, typename Host_Allocator> static void mark_unused(T *p, size_t number_elements, std::optional<size_t> location_hint = std::nullopt, std::optional<size_t> device_id = std::nullopt) { - return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements, - location_hint, device_id); + try { + return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements, + location_hint, device_id); + } catch (const std::exception &exc) { + std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl; + std::cerr << "Rethrowing exception... 
" << std::endl;; + throw; + } } #endif template <typename T, typename Host_Allocator> From 9fa7e440cf1a2e1270e6af9f9333def6bc436ec9 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Mon, 21 Aug 2023 17:21:55 -0500 Subject: [PATCH 30/42] Fix hip device side buffer --- include/hip_buffer_util.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index adc846d0..e2364095 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -121,7 +121,7 @@ struct hip_device_buffer { : allocator{device_id}, number_of_elements(number_of_elements) { assert(device_id < max_number_gpus); device_side_buffer = - recycle_allocator_hip_device<T>{}.allocate(number_of_elements); + allocator.allocate(number_of_elements); } ~hip_device_buffer() { allocator.deallocate(device_side_buffer, number_of_elements); From 4ba470aa5ac72148c31d1e2d4ecae32196d18b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 22 Aug 2023 14:35:28 -0500 Subject: [PATCH 31/42] Get stuff out of the global namespace --- include/aggregation_manager.hpp | 58 ++++++++++++++++----------------- include/detail/config.hpp | 17 +++++++--- include/stream_manager.hpp | 36 ++++++++++---------- 3 files changed, 60 insertions(+), 51 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 35338bfc..c297056c 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -144,7 +144,7 @@ template <typename Executor> class aggregated_function_call { std::any function_tuple; /// Stores the string of the first function call for debug output std::string debug_type_information; - aggregation_mutex_t debug_mut; + recycler::aggregation_mutex_t debug_mut; #endif std::vector<hpx::lcos::local::promise<void>> potential_async_promises{}; @@ -175,7 +175,7 @@ template <typename Executor> class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) // needed for concurrent access to function_tuple and debug_type_information // Not required for normal use - std::lock_guard<aggregation_mutex_t> guard(debug_mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(debug_mut); #endif assert(!async_mode); assert(potential_async_promises.empty()); @@ -249,7 +249,7 @@ template <typename Executor> class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) // needed for concurrent access to function_tuple and debug_type_information // Not required for normal use - std::lock_guard<aggregation_mutex_t> guard(debug_mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(debug_mut); #endif assert(async_mode); assert(!potential_async_promises.empty()); @@ -530,7 +530,7 @@ template <typename Executor> class Aggregated_Executor { /// slices have called it std::deque<aggregated_function_call<Executor>> function_calls; /// For synchronizing the access to the function calls list - aggregation_mutex_t mut; + recycler::aggregation_mutex_t mut; /// Data entry for a buffer allocation: void* pointer, size_t for /// buffer-size, atomic for the slice counter, location_id, gpu_id @@ -541,7 +541,7 @@ template <typename Executor> class Aggregated_Executor { /// Map pointer to deque index for fast access in the deallocations std::unordered_map<void*,size_t> buffer_allocations_map; /// For synchronizing the access to the buffer_allocations - aggregation_mutex_t buffer_mut; + 
recycler::aggregation_mutex_t buffer_mut; std::atomic<size_t> buffer_counter = 0; /// Get new buffer OR get buffer already allocated by different slice @@ -553,7 +553,7 @@ template <typename Executor> class Aggregated_Executor { // First: Check if it already has happened if (buffer_counter <= slice_alloc_counter) { // we might be the first! Lock... - std::lock_guard<aggregation_mutex_t> guard(buffer_mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(buffer_mut); // ... and recheck if (buffer_counter <= slice_alloc_counter) { constexpr bool manage_content_lifetime = false; @@ -563,7 +563,7 @@ template <typename Executor> class Aggregated_Executor { // many different buffers for different aggregation sizes on different GPUs /* size_t location_id = gpu_id * instances_per_gpu; */ // Use integer conversion to only use 0 16 32 ... as buckets - size_t location_id = ((hpx::get_worker_thread_num() % number_instances) / 16) * 16; + size_t location_id = ((hpx::get_worker_thread_num() % recycler::number_instances) / 16) * 16; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location @@ -648,7 +648,7 @@ template <typename Executor> class Aggregated_Executor { // Check if all slices are done with this buffer? if (buffer_allocation_counter == 0) { // Yes! "Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard<aggregation_mutex_t> guard(buffer_mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(buffer_mut); // Only mark unused if another buffer has not done so already (and marked // it as invalid) if (valid) { @@ -661,7 +661,7 @@ template <typename Executor> class Aggregated_Executor { const size_t current_deallocs = ++dealloc_counter; if (current_deallocs == buffer_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); buffers_in_use = false; if (!executor_slices_alive && !buffers_in_use) slices_exhausted = false; @@ -679,11 +679,11 @@ template <typename Executor> class Aggregated_Executor { /// Only meant to be accessed by the slice executors bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, executor); overall_launch_counter = function_calls.size(); @@ -699,11 +699,11 @@ template <typename Executor> class Aggregated_Executor { /// Only meant to be accessed by the slice executors template <typename F, typename... 
Ts> void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, executor); overall_launch_counter = function_calls.size(); @@ -722,11 +722,11 @@ template <typename Executor> class Aggregated_Executor { template <typename F, typename... Ts> hpx::lcos::future<void> async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, executor); overall_launch_counter = function_calls.size(); @@ -742,11 +742,11 @@ template <typename Executor> class Aggregated_Executor { template <typename F, typename... Ts> hpx::lcos::shared_future<void> wrap_async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); assert(slices_exhausted == true); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, executor); overall_launch_counter = function_calls.size(); @@ -760,12 +760,12 @@ template <typename Executor> class Aggregated_Executor { } bool slice_available(void) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); return !slices_exhausted; } std::optional<hpx::lcos::future<Executor_Slice>> request_executor_slice() { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); if (!slices_exhausted) { const size_t local_slice_id = ++current_slices; if (local_slice_id == 1) { @@ -773,7 +773,7 @@ template <typename Executor> class Aggregated_Executor { // TODO still required? 
Should be clean here already function_calls.clear(); overall_launch_counter = 0; - std::lock_guard<aggregation_mutex_t> guard(buffer_mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(buffer_mut); #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { const auto &[buffer_pointer_any, buffer_size, @@ -831,7 +831,7 @@ template <typename Executor> class Aggregated_Executor { } // Launch all executor slices within this continuation current_continuation = fut.then([this](auto &&fut) { - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); slices_exhausted = true; launched_slices = current_slices; size_t id = 0; @@ -868,7 +868,7 @@ template <typename Executor> class Aggregated_Executor { } size_t launched_slices; void reduce_usage_counter(void) { - /* std::lock_guard<aggregation_mutex_t> guard(mut); */ + /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */ assert(slices_exhausted == true); assert(executor_slices_alive == true); assert(launched_slices >= 1); @@ -885,7 +885,7 @@ template <typename Executor> class Aggregated_Executor { // std::get<0>(executor_tuple); // Mark executor fit for reusage - std::lock_guard<aggregation_mutex_t> guard(mut); + std::lock_guard<recycler::aggregation_mutex_t> guard(mut); executor_slices_alive = false; if (!executor_slices_alive && !buffers_in_use) { slices_exhausted = false; @@ -1021,7 +1021,7 @@ class aggregation_pool { std::string("Trying to initialize cppuddle aggregation pool twice") + " Agg pool name: " + std::string(kernelname)); } - if (num_devices > max_number_gpus) { + if (num_devices > recycler::max_number_gpus) { throw std::runtime_error( std::string( "Trying to initialize aggregation with more devices than the " @@ -1031,7 +1031,7 @@ class aggregation_pool { number_devices = num_devices; for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); + std::lock_guard<recycler::aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); assert(instance()[gpu_id].aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, @@ -1050,9 +1050,9 @@ class aggregation_pool { std::string("Trying to use cppuddle aggregation pool without first calling init") + " Agg poolname: " + std::string(kernelname)); } - const size_t gpu_id = get_device_id(number_devices); + const size_t gpu_id = recycler::get_device_id(number_devices); /* const size_t gpu_id = 1; */ - std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); + std::lock_guard<recycler::aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex); assert(!instance()[gpu_id].aggregation_executor_pool.empty()); std::optional<hpx::lcos::future< typename Aggregated_Executor<Interface>::Executor_Slice>> @@ -1104,11 +1104,11 @@ class aggregation_pool { private: /// Required for dealing with adding elements to the deque of /// aggregated_executors - aggregation_mutex_t pool_mutex; + recycler::aggregation_mutex_t pool_mutex; /// Global access instance static std::unique_ptr<aggregation_pool[]>& instance(void) { static std::unique_ptr<aggregation_pool[]> pool_instances{ - new aggregation_pool[max_number_gpus]}; + new aggregation_pool[recycler::max_number_gpus]}; return pool_instances; } static inline size_t number_devices = 1; diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 9b425f8b..e118e884 100644 --- 
a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -6,16 +6,13 @@ #ifndef CPPUDDLE_CONFIG_HPP #define CPPUDDLE_CONFIG_HPP + // Mutex configuration // #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) #include <hpx/mutex.hpp> -using mutex_t = hpx::spinlock_no_backoff; -using aggregation_mutex_t = hpx::mutex; #else #include <mutex> -using mutex_t = std::mutex; -using aggregation_mutex_t = std::mutex; #endif // HPX-aware configuration @@ -31,6 +28,16 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif +namespace recycler { + +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) +using mutex_t = hpx::spinlock_no_backoff; +using aggregation_mutex_t = hpx::mutex; +#else +using mutex_t = std::mutex; +using aggregation_mutex_t = std::mutex; +#endif + // Recycling configuration // TODO Add warnings here @@ -62,4 +69,6 @@ inline size_t get_device_id(const size_t number_gpus) { #endif } +} // end namespace recycler + #endif diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 8be13b32..4174af5c 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -153,7 +153,7 @@ class stream_pool { template <class Interface, class Pool> static size_t get_next_device_id(const size_t number_gpus) noexcept { // TODO add round robin and min strategy - return get_device_id(number_gpus); + return recycler::get_device_id(number_gpus); } template <class Interface, class Pool> @@ -175,7 +175,7 @@ class stream_pool { /// Deprecated! Use init_on_all_gpu or init_on_gpu template <typename... Ts> static void init(size_t number_of_streams, Ts ... executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && max_number_gpus == 1, */ + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && recycler::max_number_gpus == 1, */ /* "deprecated stream_pool::init does not support multigpu"); */ auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().streampools.emplace_back(number_of_streams, executor_args...); @@ -186,7 +186,7 @@ class stream_pool { static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) { + for (size_t gpu_id = 0; gpu_id < recycler::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); instance().streampools.emplace_back(number_of_streams, executor_args...); @@ -209,40 +209,40 @@ class stream_pool { // TODO add/rename into finalize? 
static void cleanup() { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == max_number_gpus); + assert(instance().streampools.size() == recycler::max_number_gpus); instance().streampools.clear(); } static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) { - std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == max_number_gpus); + std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == recycler::max_number_gpus); return instance().streampools[gpu_id].get_interface(); } static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == max_number_gpus); + std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == recycler::max_number_gpus); instance().streampools[gpu_id].release_interface(index); } static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == max_number_gpus); + std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == recycler::max_number_gpus); return instance().streampools[gpu_id].interface_available(load_limit); } static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == max_number_gpus); + std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); + assert(instance().streampools.size() == recycler::max_number_gpus); return instance().streampools[gpu_id].get_current_load(); } // TODO deprecated! Remove... 
/* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == max_number_gpus); */ + /* std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().streampools.size() == recycler::max_number_gpus); */ /* return instance().streampools[gpu_id].get_next_device_id(); */ /* } */ static void set_device_selector(std::function<void(size_t)> select_gpu_function) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == max_number_gpus); + assert(instance().streampools.size() == recycler::max_number_gpus); instance().select_gpu_function = select_gpu_function; } @@ -252,15 +252,15 @@ class stream_pool { private: stream_pool_implementation() = default; - mutex_t pool_mut{}; + recycler::mutex_t pool_mut{}; std::function<void(size_t)> select_gpu_function = [](size_t gpu_id) { // By default no multi gpu support - assert(max_number_gpus == 1); + assert(recycler::max_number_gpus == 1); assert(gpu_id == 0); }; std::deque<Pool> streampools{}; - std::array<mutex_t, max_number_gpus> gpu_mutexes; + std::array<recycler::mutex_t, recycler::max_number_gpus> gpu_mutexes; static stream_pool_implementation& instance(void) { static stream_pool_implementation pool_instance{}; From be375086b6a9e350234b5f55fea938648b1a9041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 22 Aug 2023 15:33:23 -0500 Subject: [PATCH 32/42] Improve cmake variable names --- CMakeLists.txt | 32 ++++++++++++++++---------------- include/detail/config.hpp | 4 ++-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb38b230..5ff430d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,13 +23,13 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR} option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) -set(CPPUDDLE_WITH_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.") +set(CPPUDDLE_WITH_MAX_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.") # HPX-related options option(CPPUDDLE_WITH_HPX "Enable basic HPX integration and examples" OFF) option(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS "Enable HPX-aware allocators for even better HPX integration" ON) set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL "Use HPX spinlock mutex instead of std::mutex") -set(CPPUDDLE_WITH_MAX_NUMBER_WORKERS "128" CACHE STRING "Max number of workers threads supported. Should match the intended number of HPX workers or be 1 in non-HPX builds.") +set(CPPUDDLE_WITH_NUMBER_BUCKETS "128" CACHE STRING "Number of internal recycle buckets buffer type. Should ideally match the intended number of HPX workers or be 1 in non-HPX builds.") # Test-related options option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. 
Useful for extended testing" OFF) option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) @@ -69,10 +69,10 @@ if(CPPUDDLE_WITH_NUMBER_GPUS GREATER 1) endif() endif() -if(CPPUDDLE_WITH_MAX_NUMBER_WORKERS GREATER 1) +if(CPPUDDLE_WITH_NUMBER_BUCKETS GREATER 1) if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) message(FATAL_ERROR " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is required for Multi-Worker build! \ - Either turn it on or configure with CPPUDDLE_WITH_MAX_NUMBER_WORKERS=1 !") + Either turn it on or configure with CPPUDDLE_WITH_NUMBER_BUCKETS=1 !") endif() endif() @@ -164,15 +164,15 @@ if (CPPUDDLE_WITH_HPX) if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) message(INFO " Compiling with HPX-aware allocators!") target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_MAX_NUMBER_GPUS}") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=${CPPUDDLE_WITH_NUMBER_BUCKETS}") else() - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1") endif() else() - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1") endif() if (CPPUDDLE_WITH_COUNTERS) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") @@ -187,15 +187,15 @@ if (CPPUDDLE_WITH_HPX) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX") if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") - target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}") - target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_MAX_NUMBER_GPUS}") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=${CPPUDDLE_WITH_NUMBER_BUCKETS}") else() - target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") - target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1") endif() else() - target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1") - target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1") endif() if 
(CPPUDDLE_WITH_COUNTERS) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") diff --git a/include/detail/config.hpp b/include/detail/config.hpp index e118e884..b5816cf3 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -49,9 +49,9 @@ using aggregation_mutex_t = std::mutex; // Thread and MultiGPU configuration // -constexpr size_t number_instances = CPPUDDLE_MAX_NUMBER_WORKERS; +constexpr size_t number_instances = CPPUDDLE_HAVE_NUMBER_BUCKETS; static_assert(number_instances >= 1); -constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS; +constexpr size_t max_number_gpus = CPPUDDLE_HAVE_MAX_NUMBER_GPUS; #ifndef CPPUDDLE_HAVE_HPX static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu"); #endif From 3310dee81e0314a885bad300ce87305e2f12f8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 22 Aug 2023 15:45:07 -0500 Subject: [PATCH 33/42] Show CI valgrind error list --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ff430d4..1fbb9fda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -415,12 +415,12 @@ if (CPPUDDLE_WITH_TESTS) find_program(VALGRIND_COMMAND valgrind) if (VALGRIND_COMMAND) add_test(allocator_memcheck.valgrind - ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_test --arraysize 5000000 --passes 200) + ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full --undef-value-errors=no --show-error-list=yes ./allocator_test --arraysize 5000000 --passes 200) set_tests_properties(allocator_memcheck.valgrind PROPERTIES PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" ) add_test(allocator_aligned_memcheck.valgrind - ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_aligned_test --arraysize 5000000 --passes 200) + ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full --undef-value-errors=no --show-error-list=yes ./allocator_aligned_test --arraysize 5000000 --passes 200) set_tests_properties(allocator_aligned_memcheck.valgrind PROPERTIES PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" ) From 73fc898edcd18e25eef43caf945cbee841e3f633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Tue, 22 Aug 2023 16:15:10 -0500 Subject: [PATCH 34/42] Fix version string --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fbb9fda..cbe132be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,8 +12,8 @@ set(CMAKE_CXX_STANDARD 17) # Version set(CPPUDDLE_VERSION_MAJOR 0) -set(CPPUDDLE_VERSION_MINOR 1) -set(CPPUDDLE_VERSION_PATCH 99) +set(CPPUDDLE_VERSION_MINOR 3) +set(CPPUDDLE_VERSION_PATCH 0) set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}.${CPPUDDLE_VERSION_PATCH}.") #------------------------------------------------------------------------------------------------------------ From 69126c5b53eb38fd90b60c7affe80a73ba199406 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 00:59:36 -0500 Subject: [PATCH 35/42] Fix stream test --- tests/stream_test.hpp | 64 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 716a8ba7..07de4c44 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -20,7 +20,7 @@ void 
test_pool_memcpy(const size_t stream_parameter, Ts &&... ts) { stream_pool::init<Interface, Pool>(stream_parameter, std::forward<Ts>(ts)...); // without interface wrapper { - auto test1 = stream_pool::get_interface<Interface, Pool>(); + auto test1 = stream_pool::get_interface<Interface, Pool>(0); Interface test1_interface = std::get<0>(test1); size_t interface_id = std::get<1>(test1); hpx::apply(test1_interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, @@ -30,12 +30,12 @@ void test_pool_memcpy(const size_t stream_parameter, Ts &&... ts) { cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); - stream_pool::release_interface<Interface, Pool>(interface_id); + stream_pool::release_interface<Interface, Pool>(interface_id, 0); } // with interface wrapper { - stream_interface<Interface, Pool> test1_interface; + stream_interface<Interface, Pool> test1_interface{0}; // hpx::cuda::cuda_executor test1_interface(0, false); hpx::apply(test1_interface.interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, hostbuffer.data(), 512 * sizeof(double), @@ -55,43 +55,43 @@ void test_pool_ref_counting(const size_t stream_parameter, Ts &&... ts) { stream_pool::init<Interface, Pool>(stream_parameter, std::forward<Ts>(ts)...); { // Allocating - auto test1 = stream_pool::get_interface<Interface, Pool>(); - auto load1 = stream_pool::get_current_load<Interface, Pool>(); + auto test1 = stream_pool::get_interface<Interface, Pool>(0); + auto load1 = stream_pool::get_current_load<Interface, Pool>(0); assert(load1 == 0); Interface test1_interface = std::get<0>(test1); size_t test1_index = std::get<1>(test1); - auto test2 = stream_pool::get_interface<Interface, Pool>(); - auto load2 = stream_pool::get_current_load<Interface, Pool>(); + auto test2 = stream_pool::get_interface<Interface, Pool>(0); + auto load2 = stream_pool::get_current_load<Interface, Pool>(0); assert(load2 == 1); Interface test2_interface = std::get<0>(test2); // auto fut = test2_interface.get_future(); size_t test2_index = std::get<1>(test2); - auto test3 = stream_pool::get_interface<Interface, Pool>(); - auto load3 = stream_pool::get_current_load<Interface, Pool>(); + auto test3 = stream_pool::get_interface<Interface, Pool>(0); + auto load3 = stream_pool::get_current_load<Interface, Pool>(0); assert(load3 == 1); Interface test3_interface = std::get<0>(test3); size_t test3_index = std::get<1>(test3); - auto test4 = stream_pool::get_interface<Interface, Pool>(); - auto load4 = stream_pool::get_current_load<Interface, Pool>(); + auto test4 = stream_pool::get_interface<Interface, Pool>(0); + auto load4 = stream_pool::get_current_load<Interface, Pool>(0); Interface test4_interface = std::get<0>(test4); size_t test4_index = std::get<1>(test4); assert(load4 == 2); // Releasing - stream_pool::release_interface<Interface, Pool>(test4_index); - load4 = stream_pool::get_current_load<Interface, Pool>(); + stream_pool::release_interface<Interface, Pool>(test4_index, 0); + load4 = stream_pool::get_current_load<Interface, Pool>(0); assert(load4 == 1); - stream_pool::release_interface<Interface, Pool>(test3_index); - load3 = stream_pool::get_current_load<Interface, Pool>(); + stream_pool::release_interface<Interface, Pool>(test3_index, 0); + load3 = stream_pool::get_current_load<Interface, Pool>(0); assert(load3 == 1); - stream_pool::release_interface<Interface, Pool>(test2_index); - load2 = stream_pool::get_current_load<Interface, Pool>(); + 
stream_pool::release_interface<Interface, Pool>(test2_index, 0); load2 = stream_pool::get_current_load<Interface, Pool>(0); assert(load2 == 0); - stream_pool::release_interface<Interface, Pool>(test1_index); - load1 = stream_pool::get_current_load<Interface, Pool>(); + stream_pool::release_interface<Interface, Pool>(test1_index, 0); + load1 = stream_pool::get_current_load<Interface, Pool>(0); assert(load1 == 0); } // Clear - auto load0 = stream_pool::get_current_load<Interface, Pool>(); + auto load0 = stream_pool::get_current_load<Interface, Pool>(0); assert(load0 == 0); stream_pool::cleanup<Interface, Pool>(); } @@ -102,28 +102,28 @@ void test_pool_wrappers(const size_t stream_parameter, Ts &&... ts) { // init ppol stream_pool::init<Interface, Pool>(stream_parameter, std::forward<Ts>(ts)...); { - wrapper_type test1; - auto load = stream_pool::get_current_load<Interface, Pool>(); + wrapper_type test1{0}; + auto load = stream_pool::get_current_load<Interface, Pool>(0); assert(load == 0); - wrapper_type test2; - load = stream_pool::get_current_load<Interface, Pool>(); + wrapper_type test2{0}; + load = stream_pool::get_current_load<Interface, Pool>(0); // auto fut = test2.get_future(); assert(load == 1); - wrapper_type test3; - load = stream_pool::get_current_load<Interface, Pool>(); + wrapper_type test3{0}; + load = stream_pool::get_current_load<Interface, Pool>(0); assert(load == 1); - wrapper_type test4; - load = stream_pool::get_current_load<Interface, Pool>(); + wrapper_type test4{0}; + load = stream_pool::get_current_load<Interface, Pool>(0); assert(load == 2); // Check availability method: - bool avail = stream_pool::interface_available<Interface, Pool>(1); + bool avail = stream_pool::interface_available<Interface, Pool>(1, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available<Interface, Pool>(2); + avail = stream_pool::interface_available<Interface, Pool>(2, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available<Interface, Pool>(3); + avail = stream_pool::interface_available<Interface, Pool>(3, 0); assert(avail == true); // NOLINT } - auto load0 = stream_pool::get_current_load<Interface, Pool>(); + auto load0 = stream_pool::get_current_load<Interface, Pool>(0); assert(load0 == 0); stream_pool::cleanup<Interface, Pool>(); } From de53e04fef277cd1c57aa358f0f4571c930c2caa Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 00:59:52 -0500 Subject: [PATCH 36/42] Fix kokkos test --- tests/allocator_kokkos_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 055fa397..2222cac3 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { test_view my_wrapper_test1(1000); test_view my_wrapper_test2(1000); - test_view my_wrapper_test3(number_instances - 1, 1000); // test 1D with location id + test_view my_wrapper_test3(recycler::number_instances - 1, 1000); // test 1D with location id double t = 2.6; Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000), KOKKOS_LAMBDA(const int n) { From ced646f56f42c5dbf08a440bf3522b5bca4c0d89 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 01:08:30 -0500 Subject: [PATCH 37/42] Fix aggregation test --- include/aggregation_manager.hpp | 4 ++-- include/detail/config.hpp | 2 +-
tests/work_aggregation_cuda_triad.cpp | 7 +++---- tests/work_aggregation_test.cpp | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index c297056c..92ad5f8d 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -777,7 +777,7 @@ template <typename Executor> class Aggregated_Executor { #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, valid, location_id] = + buffer_allocation_counter, valid, location_id, device_id] = buffer_entry; assert(!valid); } @@ -908,7 +908,7 @@ template <typename Executor> class Aggregated_Executor { #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, - valid, location_id] = buffer_entry; + valid, location_id, device_id] = buffer_entry; assert(!valid); } #endif diff --git a/include/detail/config.hpp b/include/detail/config.hpp index b5816cf3..1764ffff 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -62,7 +62,7 @@ static_assert(max_number_gpus > 0); /// Uses HPX thread information to determine which GPU should be used inline size_t get_device_id(const size_t number_gpus) { #if defined(CPPUDDLE_HAVE_HPX) - assert(number_gpus < max_number_gpus); + assert(number_gpus <= max_number_gpus); return hpx::get_worker_thread_num() % number_gpus; #else return 0; diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index 2ded567a..f3f6ec92 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -28,7 +28,6 @@ __global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_ //=============================================================================== //=============================================================================== int hpx_main(int argc, char *argv[]) { - static_assert(max_number_gpus == 1, "This test currently does not support MultiGPU builds!"); // Init parameters size_t problem_size{0}; size_t kernel_size{0}; @@ -209,7 +208,7 @@ int hpx_main(int argc, char *argv[]) { recycler::cuda_aggregated_device_buffer<float_t, decltype(alloc_device)> - device_A(slice_exec.number_slices * kernel_size, 0, + device_A(slice_exec.number_slices * kernel_size, alloc_device); std::vector<float_t, decltype(alloc_host)> local_B( @@ -217,7 +216,7 @@ int hpx_main(int argc, char *argv[]) { alloc_host); recycler::cuda_aggregated_device_buffer<float_t, decltype(alloc_device)> - device_B(slice_exec.number_slices * kernel_size, 0, + device_B(slice_exec.number_slices * kernel_size, alloc_device); std::vector<float_t, decltype(alloc_host)> local_C( @@ -225,7 +224,7 @@ int hpx_main(int argc, char *argv[]) { alloc_host); recycler::cuda_aggregated_device_buffer<float_t, decltype(alloc_device)> - device_C(slice_exec.number_slices * kernel_size, 0, + device_C(slice_exec.number_slices * kernel_size, alloc_device); for (size_t i = task_id * kernel_size, j = 0; diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 28a37806..25455633 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -605,7 +605,7 @@ void references_add_test(void) { auto &agg_exec = std::get<0>(stream_pool::get_interface< Aggregated_Executor<Dummy_Executor>, - round_robin_pool<Aggregated_Executor<Dummy_Executor>>>()); + 
round_robin_pool<Aggregated_Executor<Dummy_Executor>>>(0)); std::vector<float> erg(512); std::vector<hpx::lcos::future<void>> slices_done_futs; From 10864aa2583540b0c2ce2c2d8cdb1cdf0302e174 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 01:12:58 -0500 Subject: [PATCH 38/42] Fix kokkos regex --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ff430d4..0dd00398 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -561,12 +561,12 @@ if (CPPUDDLE_WITH_TESTS) add_test(allocator_kokkos_test.analyse_cleaned_buffers cat allocator_kokkos_test.out) set_tests_properties(allocator_kokkos_test.analyse_cleaned_buffers PROPERTIES FIXTURES_REQUIRED allocator_kokkos_output - PASS_REGULAR_EXPRESSION "--> Number cleaned up buffers:[ ]* 2" + PASS_REGULAR_EXPRESSION "--> Number cleaned up buffers:[ ]* 3" ) add_test(allocator_kokkos_test.analyse_created_buffers cat allocator_kokkos_test.out) set_tests_properties(allocator_kokkos_test.analyse_created_buffers PROPERTIES FIXTURES_REQUIRED allocator_kokkos_output - PASS_REGULAR_EXPRESSION "--> Number of times a new buffer had to be created for a request:[ ]* 2" + PASS_REGULAR_EXPRESSION "--> Number of times a new buffer had to be created for a request:[ ]* 3" ) add_test(allocator_kokkos_test.analyse_bad_allocs cat allocator_kokkos_test.out) set_tests_properties(allocator_kokkos_test.analyse_bad_allocs PROPERTIES From b012fd73f9f428aecab5b42d8624b1af3912ff24 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 12:31:13 -0500 Subject: [PATCH 39/42] Rename deactivate cmake flags --- CMakeLists.txt | 18 ++++++++++-------- include/buffer_manager.hpp | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e5198c0..1981b66e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,8 +34,8 @@ set(CPPUDDLE_WITH_NUMBER_BUCKETS "128" CACHE STRING "Number of internal recycle option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF) option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") -option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) -option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF) +option(CPPUDDLE_WITH_BUFFER_RECYCLING "Enables the default recycling behaviour! Turning this off will have a major negative performance impact and is only intended for testing!" ON) +option(CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING "Allows the aggressive allocators variants to reuse contents from previous buffers (and thus skip initializations)" ON) # Tooling options option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) @@ -213,16 +213,18 @@ else() message(INFO " Compiling with std::mutex!") endif() -if(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING") - message(WARNING " Slow Build: Buffer recycling is deactivated. This should only be used for performance tests!") -else() +if(CPPUDDLE_WITH_BUFFER_RECYCLING) message(INFO " Using default buffer recycling behaviour!") +else() + message(WARNING " Slow Build: Buffer recycling is deactivated. 
This should only be used for performance tests!") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING") endif() -if(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS) +if(CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING) + message(INFO " Using default behaviour for aggressive content reusage (only relevant for aggressive allocators)!") +else() target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS") - message(WARNING " Slow Build: Aggressive allocators disabled. This should only be used for performance tests!") + message(WARNING " Slow Build: Aggressive allocators (and thus content recycling) is disabled. This should only be used for performance tests!") endif() # install libs with the defitions: diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 32e751d9..92a5f46b 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -70,7 +70,7 @@ class buffer_recycler { // Warn about suboptimal performance without recycling #pragma message \ "Warning: Building without buffer recycling! Use only for performance testing! \ -For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF!" +For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!" template <typename T, typename Host_Allocator> static T *get(size_t number_elements, bool manage_content_lifetime = false, @@ -887,7 +887,7 @@ struct aggressive_recycle_allocator { // Warn about suboptimal performance without recycling #pragma message \ "Warning: Building without content reusage for aggressive allocators! \ -For better performance configure with CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF !" +For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !" template <typename... Args> inline void construct(T *p, Args... args) noexcept { ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...); From 65c8c9cd67b665b902175fe2bdf0f8178a0a4465 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 15:54:55 -0500 Subject: [PATCH 40/42] Fix asserts and single gpu tests for multigpu builds --- include/stream_manager.hpp | 13 ++++++++----- tests/allocator_kokkos_test.cpp | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 4174af5c..40631491 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -179,6 +179,7 @@ class stream_pool { /* "deprecated stream_pool::init does not support multigpu"); */ auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().streampools.emplace_back(number_of_streams, executor_args...); + assert(instance().streampools.size() <= recycler::max_number_gpus); } /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments @@ -192,6 +193,7 @@ class stream_pool { executor_args...); } } + assert(instance().streampools.size() <= recycler::max_number_gpus); } /// Per-GPU init allowing for different init parameters depending on the GPU @@ -204,6 +206,7 @@ class stream_pool { instance().streampools.emplace_back(number_of_streams, executor_args...); } + assert(instance().streampools.size() <= recycler::max_number_gpus); } // TODO add/rename into finalize? 
@@ -215,22 +218,22 @@ class stream_pool { static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) { std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == recycler::max_number_gpus); + assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].get_interface(); } static void release_interface(size_t index, const size_t gpu_id = 0) { std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == recycler::max_number_gpus); + assert(gpu_id < instance().streampools.size()); instance().streampools[gpu_id].release_interface(index); } static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == recycler::max_number_gpus); + assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].interface_available(load_limit); } static size_t get_current_load(const size_t gpu_id = 0) { std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); - assert(instance().streampools.size() == recycler::max_number_gpus); + assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].get_current_load(); } // TODO deprecated! Remove... @@ -255,7 +258,7 @@ class stream_pool { recycler::mutex_t pool_mut{}; std::function<void(size_t)> select_gpu_function = [](size_t gpu_id) { // By default no multi gpu support - assert(recycler::max_number_gpus == 1); + assert(recycler::max_number_gpus == 1 || instance().streampools.size() == 1); assert(gpu_id == 0); }; diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 2222cac3..e2770458 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { test_view my_wrapper_test1(1000); test_view my_wrapper_test2(1000); - test_view my_wrapper_test3(recycler::number_instances - 1, 1000); // test 1D with location id + test_view my_wrapper_test3(0, 1000); // test 1D with explicit device id parameter double t = 2.6; Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000), KOKKOS_LAMBDA(const int n) { From 6bb7fd01819b43a001420528dd78062eaa5decb2 Mon Sep 17 00:00:00 2001 From: Gregor Daiss <Gregor.Daiss+git@gmail.com> Date: Wed, 23 Aug 2023 23:53:43 -0500 Subject: [PATCH 41/42] Check if recycling is on before running tests --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1981b66e..64690fc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,6 +245,9 @@ install(EXPORT CPPuddle NAMESPACE CPPuddle:: DESTINATION ${CMAKE_INSTALL_PREFIX} ## Add target for tests and tests definitions if (CPPUDDLE_WITH_TESTS) + if(NOT CPPUDDLE_WITH_BUFFER_RECYCLING) + message(FATAL_ERROR "The CPPuddle tests only work with CPPUDDLE_WITH_BUFFER_RECYCLING=ON. 
Turning off buffer recycling is not recommended in general!") + endif() add_executable(allocator_test tests/allocator_test.cpp) if (CPPUDDLE_WITH_HPX) target_link_libraries(allocator_test From 83f99bbf1e8a6be586c3041035d304a6bb1eeffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com> Date: Thu, 24 Aug 2023 10:58:47 -0500 Subject: [PATCH 42/42] Fix performance test --- CMakeLists.txt | 21 +++------------------ tests/allocator_aligned_test.cpp | 4 ++-- tests/allocator_hpx_test.cpp | 4 ++-- tests/allocator_test.cpp | 4 ++-- 4 files changed, 9 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1981b66e..35953162 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -395,15 +395,10 @@ if (CPPUDDLE_WITH_TESTS) ) endif() if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on - add_test(allocator_test.performance.analyse_recycle_performance cat allocator_test.out) - set_tests_properties(allocator_test.performance.analyse_recycle_performance PROPERTIES - FIXTURES_REQUIRED allocator_test_output - PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!" - ) add_test(allocator_test.performance.analyse_aggressive_performance cat allocator_test.out) set_tests_properties(allocator_test.performance.analyse_aggressive_performance PROPERTIES FIXTURES_REQUIRED allocator_test_output - PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!" + PASS_REGULAR_EXPRESSION "Test information: Aggressive recycler was faster than default allocator!" ) endif() add_test(allocator_test.fixture_cleanup ${CMAKE_COMMAND} -E remove allocator_test.out) @@ -462,15 +457,10 @@ if (CPPUDDLE_WITH_TESTS) ) endif() if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on - add_test(allocator_aligned_test.performance.analyse_recycle_performance cat allocator_aligned_test.out) - set_tests_properties(allocator_aligned_test.performance.analyse_recycle_performance PROPERTIES - FIXTURES_REQUIRED allocator_aligned_test_output - PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!" - ) add_test(allocator_aligned_test.performance.analyse_aggressive_performance cat allocator_aligned_test.out) set_tests_properties(allocator_aligned_test.performance.analyse_aggressive_performance PROPERTIES FIXTURES_REQUIRED allocator_aligned_test_output - PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!" + PASS_REGULAR_EXPRESSION "Test information: Aggressive recycler was faster than default allocator!" ) endif() add_test(allocator_aligned_test.fixture_cleanup ${CMAKE_COMMAND} -E remove allocator_aligned_test.out) @@ -518,15 +508,10 @@ if (CPPUDDLE_WITH_TESTS) ) endif() if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on - add_test(allocator_concurrency_test.performance.analyse_recycle_performance cat allocator_concurrency_test.out) - set_tests_properties(allocator_concurrency_test.performance.analyse_recycle_performance PROPERTIES - FIXTURES_REQUIRED allocator_concurrency_output - PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!" 
- ) add_test(allocator_concurrency_test.performance.analyse_aggressive_performance cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.performance.analyse_aggressive_performance PROPERTIES FIXTURES_REQUIRED allocator_concurrency_output - PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!" + PASS_REGULAR_EXPRESSION "Test information: Aggressive recycler was faster than default allocator!" ) endif() add_test(allocator_concurrency_test.fixture_cleanup ${CMAKE_COMMAND} -E remove allocator_concurrency_test.out) diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 9178dfbf..c3c09217 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -142,8 +142,8 @@ int main(int argc, char *argv[]) { "recycler!" << std::endl; } - if (recycle_duration < default_duration) { - std::cout << "Test information: Recycler was faster than default allocator!" + if (aggressive_duration < default_duration) { + std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } recycler::print_performance_counters(); diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 6ddb70ad..9d8cc44b 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -177,8 +177,8 @@ int hpx_main(int argc, char *argv[]) { "recycler!" << std::endl; } - if (recycle_duration < default_duration) { - std::cout << "Test information: Recycler was faster than default allocator!" + if (aggressive_duration < default_duration) { + std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } } diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 0d8868db..004368a4 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -134,8 +134,8 @@ int main(int argc, char *argv[]) { "recycler!" << std::endl; } - if (recycle_duration < default_duration) { - std::cout << "Test information: Recycler was faster than default allocator!" + if (aggressive_duration < default_duration) { + std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } recycler::print_performance_counters();
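Taken together, the patches above move CPPuddle's entry points to explicit device ids: the stream_pool calls take a gpu_id argument, device buffers and device-side allocators take a device_id at construction, and recycler::get_device_id() maps the current HPX worker thread to a GPU. The following is a minimal usage sketch of that interface and is not part of the patch series itself; it assumes a CUDA/HPX build, and the executor type (hpx::cuda::experimental::cuda_executor), its constructor arguments, the stream count, and the buffer size are placeholder choices rather than values taken from the patches.

// Sketch only: exercises the per-GPU pool/allocator API introduced above.
// Assumes CPPUDDLE_WITH_MAX_NUMBER_GPUS covers every device used here.
#include <cstddef>
#include <tuple>
#include <hpx/async_cuda/cuda_executor.hpp>
#include "buffer_manager.hpp"
#include "cuda_buffer_util.hpp"
#include "stream_manager.hpp"

using executor_t = hpx::cuda::experimental::cuda_executor;
using pool_t = round_robin_pool<executor_t>;

void example_multigpu_usage() {
  // One executor pool per GPU with 8 executors each (placeholder count);
  // the trailing arguments are forwarded to each executor constructor.
  stream_pool::init_all_executor_pools<executor_t, pool_t>(8, 0, false);

  // Map the current HPX worker thread to one of the configured GPUs.
  const std::size_t gpu_id = recycler::get_device_id(recycler::max_number_gpus);

  // Device buffer bound to that GPU (assumes the CUDA buffer mirrors the
  // hip_device_buffer constructor adapted in patch 27).
  recycler::cuda_device_buffer<double> device_buf(512, gpu_id);

  // Borrow an executor from that GPU's pool, use it, then hand it back.
  auto interface_tuple = stream_pool::get_interface<executor_t, pool_t>(gpu_id);
  executor_t exec = std::get<0>(interface_tuple);
  const std::size_t interface_id = std::get<1>(interface_tuple);
  // ... launch kernels / async memcpys on exec against device_buf here ...
  stream_pool::release_interface<executor_t, pool_t>(interface_id, gpu_id);

  stream_pool::cleanup<executor_t, pool_t>();
}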