From f4aac16d42706f989424aa86b8f08f9a3e074681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 3 Oct 2023 13:00:50 -0500 Subject: [PATCH 1/4] Prototype --- include/stream_manager.hpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 34c05d19..7799c8c3 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -18,6 +18,13 @@ #include #include "../include/detail/config.hpp" +#include + +#ifndef KOKKOS_ENABLE_SERIAL +namespace hpx { namespace kokkos { +enum class execution_space_mode { global, independent }; +}} +#endif /// Turns a std::array_mutex into an scoped lock template @@ -293,16 +300,20 @@ class stream_pool { template class stream_interface { public: - explicit stream_interface(size_t gpu_id) - : t(stream_pool::get_interface(gpu_id)), - interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} + + template + explicit stream_interface(size_t gpu_id, + std::enable_if_t::value, int> = 0) + : gpu_id(gpu_id), interface(gpu_id) {} + template + explicit stream_interface(std::enable_if_t::value, int> = 0) + : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} stream_interface(const stream_interface &other) = delete; stream_interface &operator=(const stream_interface &other) = delete; stream_interface(stream_interface &&other) = delete; stream_interface &operator=(stream_interface &&other) = delete; ~stream_interface() { - stream_pool::release_interface(interface_index, gpu_id); } template @@ -325,12 +336,10 @@ template class stream_interface { } private: - std::tuple t; - size_t interface_index; size_t gpu_id; public: - Interface &interface; + Interface interface; }; #endif From d016e65cd7662ff1697b9f5f4e1a205ce0562d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 3 Oct 2023 20:11:59 -0500 Subject: [PATCH 2/4] Add option to disable executor pool --- 
CMakeLists.txt | 8 ++++++ include/stream_manager.hpp | 56 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 46685075..5a748b16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") option(CPPUDDLE_WITH_BUFFER_RECYCLING "Enables the default recycling behaviour! Turning this off will have a major negative performance impact and is only intended for testing!" ON) option(CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING "Allows the aggressive allocators variants to reuse contents from previous buffers (and thus skip initializations)" ON) +option(CPPUDDLE_WITH_EXECUTOR_RECYCLING "Enables the default executor recycling behaviour! Turning this off will have a major negative performance impact and is only intended for testing!" ON) # Tooling options option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) @@ -227,6 +228,13 @@ else() message(WARNING " Slow Build: Aggressive allocators (and thus content recycling) is disabled. This should only be used for performance tests!") endif() +if(CPPUDDLE_WITH_EXECUTOR_RECYCLING) + message(INFO " Using default executor recycling behaviour!") +else() + message(WARNING " Slow Build: executor recycling is deactivated. 
This should only be used for performance tests!") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING") +endif() + # install libs with the defitions: install(TARGETS buffer_manager EXPORT CPPuddle ) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 7799c8c3..896a7d73 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -298,6 +298,16 @@ class stream_pool { stream_pool &operator=(stream_pool &&other) = delete; }; +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without executor recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" + +/// Slow version of the stream_interface that does not draw its +/// executors (Interface) from the pool but creates them instead. +/// Only meant for performance comparisons and only works with cuda/kokkos executors template class stream_interface { public: @@ -341,5 +351,51 @@ template class stream_interface { public: Interface interface; }; +#else +/// Stream interface for RAII purposes +/// Draws executor from the stream pool and releases it upon +/// destruction +template class stream_interface { +public: + explicit stream_interface(size_t gpu_id) + : t(stream_pool::get_interface(gpu_id)), + interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} + + stream_interface(const stream_interface &other) = delete; + stream_interface &operator=(const stream_interface &other) = delete; + stream_interface(stream_interface &&other) = delete; + stream_interface &operator=(stream_interface &&other) = delete; + ~stream_interface() { + stream_pool::release_interface(interface_index, gpu_id); + } + + template + inline decltype(auto) post(F &&f, Ts &&... 
ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implicit conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + std::tuple t; + size_t interface_index; + size_t gpu_id; + +public: + Interface &interface; +}; +#endif #endif From 9958a54d1861092182717d0f26945b07f036de6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 4 Oct 2023 01:17:32 -0500 Subject: [PATCH 3/4] Fix defines --- include/stream_manager.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 896a7d73..c1043c9a 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -20,11 +20,16 @@ #include "../include/detail/config.hpp" #include +// Redefintion required for non-recycling executors +// Without it, default constructing the executors (independent) would not work +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +// Do only define if Kokkos is not found #ifndef KOKKOS_ENABLE_SERIAL namespace hpx { namespace kokkos { enum class execution_space_mode { global, independent }; }} #endif +#endif /// Turns a std::array_mutex into an scoped lock template From 9516f32cf23874380738446808d4c6fa7d185ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 5 Oct 2023 17:41:37 -0500 Subject: [PATCH 4/4] Fix include guards --- CMakeLists.txt | 5 ++++- include/stream_manager.hpp | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a748b16..630f12ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,19 +219,22 @@ if(CPPUDDLE_WITH_BUFFER_RECYCLING) else() message(WARNING " Slow Build: Buffer recycling is deactivated. 
This should only be used for performance tests!") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING") endif() if(CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING) message(INFO " Using default behaviour for aggressive content reusage (only relevant for aggressive allocators)!") else() - target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS") message(WARNING " Slow Build: Aggressive allocators (and thus content recycling) is disabled. This should only be used for performance tests!") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS") endif() if(CPPUDDLE_WITH_EXECUTOR_RECYCLING) message(INFO " Using default executor recycling behaviour!") else() message(WARNING " Slow Build: executor recycling is deactivated. This should only be used for performance tests!") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING") target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING") endif() diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index c1043c9a..e9ae375c 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -18,7 +18,15 @@ #include #include "../include/detail/config.hpp" + +// Need the cuda/hip definitions for default params when NOT +// drawing from an executor pool +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +#include +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) #include +#endif +#endif // Redefintion required for non-recycling executors // Without it, default constructing the executors (independent) would not work