From 584013bd721c07ef740b778fe868299ecc7fa5f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 16 Mar 2023 22:48:50 +0100 Subject: [PATCH 01/46] Change singleton method and remove reference counting --- include/buffer_manager.hpp | 349 +++++------------------------ include/kokkos_buffer_util.hpp | 30 ++- src/buffer_manager_definitions.cpp | 8 +- 3 files changed, 84 insertions(+), 303 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index f2c19b8f..ce96a913 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -19,7 +19,9 @@ #include #endif + namespace recycler { +constexpr size_t number_instances = 64; namespace detail { namespace util { @@ -52,59 +54,48 @@ class buffer_recycler { /// buffer template static T *get(size_t number_elements, bool manage_content_lifetime = false) { - std::lock_guard guard(mut); - if (!recycler_instance) { - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - recycler_instance.reset(new buffer_recycler()); - } + std::lock_guard guard(instance().mut); return buffer_manager::get(number_elements, manage_content_lifetime); } /// Marks an buffer as unused and fit for reusage template static void mark_unused(T *p, size_t number_elements) { - std::lock_guard guard(mut); - if (recycler_instance) { // if the instance was already destroyed all - // buffers are destroyed anyway - return buffer_manager::mark_unused(p, number_elements); - } + std::lock_guard guard(instance().mut); + return buffer_manager::mark_unused(p, number_elements); } /// Increase the reference coutner of a buffer - template - static void increase_usage_counter(T *p, size_t number_elements) noexcept { - std::lock_guard guard(mut); - if (recycler_instance) { // if the instance was already destroyed all - // buffers are destroyed anyway - return buffer_manager::increase_usage_counter( - p, number_elements); - } - } + /* template */ + /* static void increase_usage_counter(T *p, size_t number_elements) noexcept { */ + /* std::lock_guard guard(instance().mut); */ + /* return buffer_manager::increase_usage_counter( */ + /* p, number_elements); */ + /* } */ /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { - std::lock_guard guard(mut); - if (recycler_instance) { - for (const auto &clean_function : - recycler_instance->total_cleanup_callbacks) { - clean_function(); - } + std::lock_guard guard(instance().mut); + for (const auto &clean_function : + instance().total_cleanup_callbacks) { + clean_function(); } - recycler_instance.reset(); } /// Deallocated all currently unused buffer static void clean_unused_buffers() { - std::lock_guard guard(mut); - if (recycler_instance) { - for (const auto &clean_function : - recycler_instance->partial_cleanup_callbacks) { - clean_function(); - } + std::lock_guard guard(instance().mut); + for (const auto &clean_function : + instance().partial_cleanup_callbacks) { + clean_function(); } } // Member variables and methods private: - /// Singleton instance pointer - static std::unique_ptr recycler_instance; + + /// Singleton instance access + static buffer_recycler& instance() { + static buffer_recycler singleton{}; + return singleton; + } /// Callbacks for buffer_manager cleanups - each callback completely destroys /// one buffer_manager std::list> total_cleanup_callbacks; @@ -114,22 +105,20 @@ class buffer_recycler { /// One Mutex to control concurrent access - Since we do not actually ever /// return the singleton instance anywhere, this should 
hopefully suffice We /// want more fine-grained concurrent access eventually - static std::mutex mut; + std::mutex mut; /// default, private constructor - not automatically constructed due to the /// deleted constructors buffer_recycler() = default; /// Add a callback function that gets executed upon cleanup and destruction static void add_total_cleanup_callback(const std::function &func) { - // This methods assumes instance is initialized since it is a private method - // and all static public methods have guards - recycler_instance->total_cleanup_callbacks.push_back(func); + /* std::lock_guard guard(instance().mut); */ + instance().total_cleanup_callbacks.push_back(func); } /// Add a callback function that gets executed upon partial (unused memory) /// cleanup static void add_partial_cleanup_callback(const std::function &func) { - // This methods assumes instance is initialized since it is a private method - // and all static public methods have guards - recycler_instance->partial_cleanup_callbacks.push_back(func); + /* std::lock_guard guard(instance().mut); */ + instance().partial_cleanup_callbacks.push_back(func); } public: @@ -180,7 +169,7 @@ class buffer_recycler { auto tuple = *iter; if (std::get<1>(tuple) == number_of_elements) { manager_instance->unused_buffer_list.erase(iter); - std::get<2>(tuple)++; // increase usage counter to 1 + /* std::get<2>(tuple)++; // increase usage counter to 1 */ // handle the switch from aggressive to non aggressive reusage (or // vice-versa) @@ -200,7 +189,7 @@ class buffer_recycler { } } - // No unsued buffer found -> Create new one and return it + // No unused buffer found -> Create new one and return it try { Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); @@ -249,25 +238,29 @@ class buffer_recycler { auto &tuple = it->second; // sanity checks: assert(std::get<1>(tuple) == number_of_elements); - assert(std::get<2>(tuple) >= 1); - std::get<2>(tuple)--; // decrease usage counter - if (std::get<2>(tuple) == 0) { // not used anymore? - // move to the unused_buffer list - manager_instance->unused_buffer_list.push_front(tuple); - manager_instance->buffer_map.erase(memory_location); - } + // move to the unused_buffer list + manager_instance->unused_buffer_list.push_front(tuple); + manager_instance->buffer_map.erase(memory_location); + + /* assert(std::get<2>(tuple) >= 1); */ + /* std::get<2>(tuple)--; // decrease usage counter */ + /* if (std::get<2>(tuple) == 0) { // not used anymore? 
*/ + /* // move to the unused_buffer list */ + /* manager_instance->unused_buffer_list.push_front(tuple); */ + /* manager_instance->buffer_map.erase(memory_location); */ + /* } */ } - static void increase_usage_counter(T *memory_location, - size_t number_of_elements) noexcept { - auto it = manager_instance->buffer_map.find(memory_location); - assert(it != manager_instance->buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - assert(std::get<2>(tuple) >= 1); - std::get<2>(tuple)++; // increase usage counter - } + /* static void increase_usage_counter(T *memory_location, */ + /* size_t number_of_elements) noexcept { */ + /* auto it = manager_instance->buffer_map.find(memory_location); */ + /* assert(it != manager_instance->buffer_map.end()); */ + /* auto &tuple = it->second; */ + /* // sanity checks: */ + /* assert(std::get<1>(tuple) == number_of_elements); */ + /* assert(std::get<2>(tuple) >= 1); */ + /* std::get<2>(tuple)++; // increase usage counter */ + /* } */ private: /// List with all buffers still in usage @@ -281,6 +274,8 @@ class buffer_recycler { #endif /// Singleton instance static std::unique_ptr> manager_instance; + // Array of instances + static std::unique_ptr> manager_instances; /// default, private constructor - not automatically constructed due to the /// deleted constructors buffer_manager() = default; @@ -353,223 +348,6 @@ class buffer_recycler { operator=(buffer_manager &&other) = delete; }; - /// Memory Manager subclass to handle buffers a specific type - template - class mutexless_buffer_manager { - private: - // Tuple content: Pointer to buffer, buffer_size, reference_counter, Flag - // The flag at the end controls whether to buffer content is to be reused as - // well - using buffer_entry_type = std::tuple; - - public: - /// Cleanup and delete this singleton - static void clean() { manager_instance.reset(); } - /// Cleanup all buffers not currently in use - static void clean_unused_buffers_only() { - if (!manager_instance) { - return; - } - for (auto &buffer_tuple : manager_instance->unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - manager_instance->unused_buffer_list.clear(); - } - - /// Tries to recycle or create a buffer of type T and size number_elements. 
- static T *get(size_t number_of_elements, bool manage_content_lifetime) { - if (!manager_instance) { - manager_instance.reset(new mutexless_buffer_manager()); - buffer_recycler::add_total_cleanup_callback(clean); - buffer_recycler::add_partial_cleanup_callback( - clean_unused_buffers_only); - } -#ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_allocation++; -#endif - // Check for unused buffers we can recycle: - for (auto iter = manager_instance->unused_buffer_list.begin(); - iter != manager_instance->unused_buffer_list.end(); iter++) { - auto tuple = *iter; - if (std::get<1>(tuple) == number_of_elements) { - manager_instance->unused_buffer_list.erase(iter); - std::get<2>(tuple)++; // increase usage counter to 1 - - // handle the switch from aggressive to non aggressive reusage (or - // vice-versa) - if (manage_content_lifetime && !std::get<3>(tuple)) { - util::uninitialized_value_construct_n(std::get<0>(tuple), - number_of_elements); - std::get<3>(tuple) = true; - } else if (!manage_content_lifetime && std::get<3>(tuple)) { - util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); - std::get<3>(tuple) = false; - } - manager_instance->buffer_map.insert({std::get<0>(tuple), tuple}); -#ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_recycling++; -#endif - return std::get<0>(tuple); - } - } - - // No unsued buffer found -> Create new one and return it - try { - Host_Allocator alloc; - T *buffer = alloc.allocate(number_of_elements); - manager_instance->buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_creation++; -#endif - if (manage_content_lifetime) { - util::uninitialized_value_construct_n(buffer, number_of_elements); - } - return buffer; - } catch (std::bad_alloc &e) { - // not enough memory left! Cleanup and attempt again: - buffer_recycler::clean_unused_buffers(); - - // If there still isn't enough memory left, the caller has to handle it - // We've done all we can in here - Host_Allocator alloc; - T *buffer = alloc.allocate(number_of_elements); - manager_instance->buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_creation++; - manager_instance->number_bad_alloc++; -#endif - if (manage_content_lifetime) { - util::uninitialized_value_construct_n(buffer, number_of_elements); - } - return buffer; - } - } - - static void mark_unused(T *memory_location, size_t number_of_elements) { - // This will never be called without an instance since all access for this - // method comes from the buffer recycler We can forego the instance - // existence check here - assert(manager_instance); -#ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_dealloacation++; -#endif - auto it = manager_instance->buffer_map.find(memory_location); - assert(it != manager_instance->buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - assert(std::get<2>(tuple) >= 1); - std::get<2>(tuple)--; // decrease usage counter - if (std::get<2>(tuple) == 0) { // not used anymore? 
- // move to the unused_buffer list - manager_instance->unused_buffer_list.push_front(tuple); - manager_instance->buffer_map.erase(memory_location); - } - } - - static void increase_usage_counter(T *memory_location, - size_t number_of_elements) noexcept { - auto it = manager_instance->buffer_map.find(memory_location); - assert(it != manager_instance->buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - assert(std::get<2>(tuple) >= 1); - std::get<2>(tuple)++; // increase usage counter - } - - private: - /// List with all buffers still in usage - std::unordered_map buffer_map{}; - /// List with all buffers currently not used - std::list unused_buffer_list{}; -#ifdef CPPUDDLE_HAVE_COUNTERS - /// Performance counters - size_t number_allocation{0}, number_dealloacation{0}; - size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0}; -#endif - /// Singleton instance - static std::unique_ptr> - manager_instance; - /// default, private constructor - not automatically constructed due to the - /// deleted constructors - mutexless_buffer_manager() = default; - - public: - ~mutexless_buffer_manager() { - for (auto &buffer_tuple : unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - for (auto &map_tuple : buffer_map) { - auto buffer_tuple = map_tuple.second; - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } -#ifdef CPPUDDLE_HAVE_COUNTERS - // Print performance counters - size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); - std::cout << "\nBuffer mananger destructor for buffers of type " - << typeid(Host_Allocator).name() << "->" << typeid(T).name() - << ":" << std::endl - << "----------------------------------------------------" - << std::endl - << "--> Number of bad_allocs that triggered garbage " - "collection: " - << number_bad_alloc << std::endl - << "--> Number of buffers that got requested from this " - "manager: " - << number_allocation << std::endl - << "--> Number of times an unused buffer got recycled for a " - "request: " - << number_recycling << std::endl - << "--> Number of times a new buffer had to be created for a " - "request: " - << number_creation << std::endl - << "--> Number cleaned up buffers: " - " " - << number_cleaned << std::endl - << "--> Number of buffers that were marked as used upon " - "cleanup: " - << buffer_map.size() << std::endl - << "==> Recycle rate: " - " " - << static_cast(number_recycling) / number_allocation * - 100.0f - << "%" << std::endl; - // assert(buffer_map.size() == 0); // Were there any buffers still used? 
-#endif - unused_buffer_list.clear(); - buffer_map.clear(); - } - - public: // Putting deleted constructors in public gives more useful error - // messages - // Bunch of constructors we don't need - mutexless_buffer_manager( - mutexless_buffer_manager const &other) = delete; - mutexless_buffer_manager operator=( - mutexless_buffer_manager const &other) = delete; - mutexless_buffer_manager( - mutexless_buffer_manager &&other) = delete; - mutexless_buffer_manager - operator=(mutexless_buffer_manager &&other) = delete; - }; - public: // Putting deleted constructors in public gives more useful error messages // Bunch of constructors we don't need @@ -583,11 +361,6 @@ template std::unique_ptr> buffer_recycler::buffer_manager::manager_instance{}; -template -std::unique_ptr> - buffer_recycler::mutexless_buffer_manager< - T, Host_Allocator>::manager_instance{}; - template struct recycle_allocator { using value_type = T; recycle_allocator() noexcept = default; @@ -606,9 +379,9 @@ template struct recycle_allocator { ::new (static_cast(p)) T(std::forward(args)...); } void destroy(T *p) { p->~T(); } - void increase_usage_counter(T *p, size_t n) { - buffer_recycler::increase_usage_counter(p, n); - } + /* void increase_usage_counter(T *p, size_t n) { */ + /* buffer_recycler::increase_usage_counter(p, n); */ + /* } */ }; template constexpr bool @@ -647,9 +420,9 @@ struct aggressive_recycle_allocator { // Do nothing here - Contents will be destroyed when the buffer manager is // destroyed, not before } - void increase_usage_counter(T *p, size_t n) { - buffer_recycler::increase_usage_counter(p, n); - } + /* void increase_usage_counter(T *p, size_t n) { */ + /* buffer_recycler::increase_usage_counter(p, n); */ + /* } */ }; template constexpr bool diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index eaadf2aa..a95b9ec0 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -33,12 +33,13 @@ class aggregated_recycled_view : public kokkos_type { explicit aggregated_recycled_view(alloc_type &alloc, Args... args) : kokkos_type( alloc.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), + sizeof(element_type)), args...), total_elements(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), allocator(alloc), - data_ref_counter(this->data(), view_deleter(alloc, total_elements)) - {} + sizeof(element_type)), + allocator(alloc), + data_ref_counter(this->data(), view_deleter( + alloc, total_elements)) {} aggregated_recycled_view( const aggregated_recycled_view &other) @@ -80,6 +81,7 @@ class recycled_view : public kokkos_type { private: static alloc_type allocator; size_t total_elements{0}; + std::shared_ptr data_ref_counter; public: template @@ -89,19 +91,23 @@ class recycled_view : public kokkos_type { sizeof(element_type)), args...), total_elements(kokkos_type::required_allocation_size(args...) 
/ - sizeof(element_type)) {} + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + allocator, total_elements)) {} recycled_view( const recycled_view &other) : kokkos_type(other) { total_elements = other.total_elements; + data_ref_counter = other.data_ref_counter; - allocator.increase_usage_counter(this->data(), this->total_elements); + /* allocator.increase_usage_counter(this->data(), this->total_elements); */ } recycled_view & operator=(const recycled_view &other) { - allocator.deallocate(this->data(), total_elements); + data_ref_counter = other.data_ref_counter; + /* allocator.deallocate(this->data(), total_elements); */ kokkos_type::operator=(other); total_elements = other.total_elements; allocator.increase_usage_counter(other.data(), other.total_elements); @@ -111,20 +117,22 @@ class recycled_view : public kokkos_type { recycled_view( recycled_view &&other) noexcept : kokkos_type(other) { + data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; - allocator.increase_usage_counter(other.data(), other.total_elements); + /* allocator.increase_usage_counter(other.data(), other.total_elements); */ } recycled_view &operator=( recycled_view &&other) noexcept { - allocator.deallocate(this->data(), total_elements); + data_ref_counter = other.data_ref_counter; + /* allocator.deallocate(this->data(), total_elements); */ kokkos_type::operator=(other); total_elements = other.total_elements; - allocator.increase_usage_counter(other.data(), other.total_elements); + /* allocator.increase_usage_counter(other.data(), other.total_elements); */ return *this; } - ~recycled_view() { allocator.deallocate(this->data(), total_elements); } + ~recycled_view() { /*allocator.deallocate(this->data(), total_elements); */ } }; template diff --git a/src/buffer_manager_definitions.cpp b/src/buffer_manager_definitions.cpp index 403744ef..34e635b0 100644 --- a/src/buffer_manager_definitions.cpp +++ b/src/buffer_manager_definitions.cpp @@ -3,9 +3,9 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" +//#include "../include/buffer_manager.hpp" // Instance defintions -std::unique_ptr - recycler::detail::buffer_recycler::recycler_instance{}; -std::mutex recycler::detail::buffer_recycler::mut{}; +/* std::unique_ptr */ +/* recycler::detail::buffer_recycler::recycler_instance{}; */ +/* std::mutex recycler::detail::buffer_recycler::mut{}; */ From 502e9524b1bc3c3ba0665bdf2233fca73f47eabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 18 Mar 2023 01:32:00 +0100 Subject: [PATCH 02/46] Use static instance method for buffer manager --- include/buffer_manager.hpp | 139 ++++++++++++++++----------------- include/kokkos_buffer_util.hpp | 7 +- 2 files changed, 67 insertions(+), 79 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ce96a913..1e583349 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -21,7 +21,7 @@ namespace recycler { -constexpr size_t number_instances = 64; +constexpr size_t number_instances = 1; namespace detail { namespace util { @@ -122,53 +122,53 @@ class buffer_recycler { } public: - ~buffer_recycler() = default; // public destructor for unique_ptr instance + /* ~buffer_recycler() = default; // public destructor for unique_ptr instance */ + ~buffer_recycler() { + clean_all(); + } // Subclasses private: /// Memory Manager subclass to handle buffers a specific type template class buffer_manager { private: - // Tuple content: Pointer to buffer, buffer_size, reference_counter, Flag + // Tuple content: Pointer to buffer, buffer_size, location ID, Flag // The flag at the end controls whether to buffer content is to be reused as // well using buffer_entry_type = std::tuple; public: /// Cleanup and delete this singleton - static void clean() { manager_instance.reset(); } + static void clean() { + instance().reset(new buffer_manager[number_instances]); + } /// Cleanup all buffers not currently in use static void clean_unused_buffers_only() { - if (!manager_instance) { - return; - } - for (auto &buffer_tuple : manager_instance->unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + for (auto i = 0; i < number_instances; i++) { + for (auto &buffer_tuple : instance()[i].unused_buffer_list) { + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + instance()[i].unused_buffer_list.clear(); } - manager_instance->unused_buffer_list.clear(); } /// Tries to recycle or create a buffer of type T and size number_elements. 
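// A minimal standalone sketch (not taken from this patch; manager_pool and its
// members are placeholder names) of the idiom the rewritten get() below relies
// on: since C++11 a function-local static is constructed exactly once even when
// several threads hit it concurrently, and std::call_once covers one-time side
// effects such as registering the cleanup callbacks, so the old
// "reset-the-unique_ptr-under-a-lock" dance becomes unnecessary.

#include <mutex>

class manager_pool {
public:
  static manager_pool &instance() {
    // Construction of this static is race-free since C++11 ("magic static").
    static manager_pool singleton{};
    return singleton;
  }

  static void init_callbacks_once() {
    // Runs the lambda exactly once, no matter how many threads call this.
    static std::once_flag flag;
    std::call_once(flag, [] { /* register cleanup callbacks here */ });
  }

private:
  manager_pool() = default;
};

// buffer_recycler::instance() from the previous commit and the buffer_manager
// instance()/init_callbacks_once() pair introduced below follow this pattern.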
- static T *get(size_t number_of_elements, bool manage_content_lifetime) { - if (!manager_instance) { - manager_instance.reset(new buffer_manager()); - buffer_recycler::add_total_cleanup_callback(clean); - buffer_recycler::add_partial_cleanup_callback( - clean_unused_buffers_only); - } + static T *get(size_t number_of_elements, bool manage_content_lifetime, size_t id = 0) { + init_callbacks_once(); + #ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_allocation++; + instance()[id].number_allocation++; #endif // Check for unused buffers we can recycle: - for (auto iter = manager_instance->unused_buffer_list.begin(); - iter != manager_instance->unused_buffer_list.end(); iter++) { + for (auto iter = instance()[id].unused_buffer_list.begin(); + iter != instance()[id].unused_buffer_list.end(); iter++) { auto tuple = *iter; if (std::get<1>(tuple) == number_of_elements) { - manager_instance->unused_buffer_list.erase(iter); + instance()[id].unused_buffer_list.erase(iter); /* std::get<2>(tuple)++; // increase usage counter to 1 */ // handle the switch from aggressive to non aggressive reusage (or @@ -181,9 +181,9 @@ class buffer_recycler { util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); std::get<3>(tuple) = false; } - manager_instance->buffer_map.insert({std::get<0>(tuple), tuple}); + instance()[id].buffer_map.insert({std::get<0>(tuple), tuple}); #ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_recycling++; + instance()[id].number_recycling++; #endif return std::get<0>(tuple); } @@ -193,11 +193,11 @@ class buffer_recycler { try { Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); - manager_instance->buffer_map.insert( + instance()[id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_creation++; + instance()[id].number_creation++; #endif if (manage_content_lifetime) { util::uninitialized_value_construct_n(buffer, number_of_elements); @@ -211,12 +211,12 @@ class buffer_recycler { // We've done all we can in here Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); - manager_instance->buffer_map.insert( + instance()[id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_creation++; - manager_instance->number_bad_alloc++; + instance()[id].number_creation++; + instance()[id].number_bad_alloc++; #endif if (manage_content_lifetime) { util::uninitialized_value_construct_n(buffer, number_of_elements); @@ -225,42 +225,20 @@ class buffer_recycler { } } - static void mark_unused(T *memory_location, size_t number_of_elements) { - // This will never be called without an instance since all access for this - // method comes from the buffer recycler We can forego the instance - // existence check here - assert(manager_instance); + static void mark_unused(T *memory_location, size_t number_of_elements, size_t id = 0) { #ifdef CPPUDDLE_HAVE_COUNTERS - manager_instance->number_dealloacation++; + instance()[id].number_dealloacation++; #endif - auto it = manager_instance->buffer_map.find(memory_location); - assert(it != manager_instance->buffer_map.end()); + auto it = instance()[id].buffer_map.find(memory_location); + assert(it != instance()[id].buffer_map.end()); auto &tuple = it->second; // sanity checks: assert(std::get<1>(tuple) == number_of_elements); // move to the unused_buffer list - manager_instance->unused_buffer_list.push_front(tuple); 
- manager_instance->buffer_map.erase(memory_location); - - /* assert(std::get<2>(tuple) >= 1); */ - /* std::get<2>(tuple)--; // decrease usage counter */ - /* if (std::get<2>(tuple) == 0) { // not used anymore? */ - /* // move to the unused_buffer list */ - /* manager_instance->unused_buffer_list.push_front(tuple); */ - /* manager_instance->buffer_map.erase(memory_location); */ - /* } */ - } + instance()[id].unused_buffer_list.push_front(tuple); + instance()[id].buffer_map.erase(memory_location); - /* static void increase_usage_counter(T *memory_location, */ - /* size_t number_of_elements) noexcept { */ - /* auto it = manager_instance->buffer_map.find(memory_location); */ - /* assert(it != manager_instance->buffer_map.end()); */ - /* auto &tuple = it->second; */ - /* // sanity checks: */ - /* assert(std::get<1>(tuple) == number_of_elements); */ - /* assert(std::get<2>(tuple) >= 1); */ - /* std::get<2>(tuple)++; // increase usage counter */ - /* } */ + } private: /// List with all buffers still in usage @@ -273,15 +251,37 @@ class buffer_recycler { size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0}; #endif /// Singleton instance - static std::unique_ptr> manager_instance; - // Array of instances - static std::unique_ptr> manager_instances; + /* static std::unique_ptr> manager_instance; */ /// default, private constructor - not automatically constructed due to the /// deleted constructors buffer_manager() = default; + buffer_manager& + operator=(buffer_manager const &other) = default; + buffer_manager& + operator=(buffer_manager &&other) = delete; + static std::unique_ptr& instance(void) { + /* static std::array instances{{}}; */ + static std::unique_ptr instances{ + new buffer_manager[number_instances]}; + return instances; + } + static void init_callbacks_once(void) { + static std::once_flag flag; + std::call_once(flag, []() { + buffer_recycler::add_total_cleanup_callback(clean); + buffer_recycler::add_partial_cleanup_callback( + clean_unused_buffers_only); + }); + } + public: ~buffer_manager() { + if (number_allocation == 0 && number_recycling == 0 && + number_bad_alloc == 0 && number_creation == 0 && + unused_buffer_list.empty() && buffer_map.empty()) { + return; + } for (auto &buffer_tuple : unused_buffer_list) { Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { @@ -340,12 +340,8 @@ class buffer_recycler { // Bunch of constructors we don't need buffer_manager( buffer_manager const &other) = delete; - buffer_manager - operator=(buffer_manager const &other) = delete; buffer_manager( buffer_manager &&other) = delete; - buffer_manager - operator=(buffer_manager &&other) = delete; }; public: @@ -357,9 +353,9 @@ class buffer_recycler { buffer_recycler operator=(buffer_recycler &&other) = delete; }; -template -std::unique_ptr> - buffer_recycler::buffer_manager::manager_instance{}; +/* template */ +/* std::unique_ptr> */ +/* buffer_recycler::buffer_manager::manager_instance{}; */ template struct recycle_allocator { using value_type = T; @@ -379,9 +375,6 @@ template struct recycle_allocator { ::new (static_cast(p)) T(std::forward(args)...); } void destroy(T *p) { p->~T(); } - /* void increase_usage_counter(T *p, size_t n) { */ - /* buffer_recycler::increase_usage_counter(p, n); */ - /* } */ }; template constexpr bool diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index a95b9ec0..2b45406e 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -101,13 +101,11 @@ class recycled_view : public kokkos_type { 
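// The hunk below only removes the commented-out manual deallocate /
// increase_usage_counter calls: since the first commit of this series,
// recycled_view keeps its buffer alive through the data_ref_counter member, a
// std::shared_ptr built with a custom view_deleter that is expected to hand the
// buffer back to the recycling allocator once the last copy or move of the view
// is destroyed. A minimal sketch of that shared_ptr-with-custom-deleter pattern
// (return_to_pool and make_shared_buffer are placeholder names, not CPPuddle
// API):

#include <cstddef>
#include <memory>

// Deleter that returns a recycled buffer to its allocator instead of freeing it.
template <typename T, typename Allocator>
struct return_to_pool {
  Allocator alloc;
  std::size_t count;
  void operator()(T *ptr) { alloc.deallocate(ptr, count); }
};

// Every copy of a view shares this shared_ptr, so the buffer is returned
// exactly once, when the last owner goes away.
template <typename T, typename Allocator>
std::shared_ptr<T> make_shared_buffer(Allocator alloc, std::size_t count) {
  T *raw = alloc.allocate(count);
  return std::shared_ptr<T>(raw, return_to_pool<T, Allocator>{alloc, count});
}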
total_elements = other.total_elements; data_ref_counter = other.data_ref_counter; - /* allocator.increase_usage_counter(this->data(), this->total_elements); */ } recycled_view & operator=(const recycled_view &other) { data_ref_counter = other.data_ref_counter; - /* allocator.deallocate(this->data(), total_elements); */ kokkos_type::operator=(other); total_elements = other.total_elements; allocator.increase_usage_counter(other.data(), other.total_elements); @@ -119,20 +117,17 @@ class recycled_view : public kokkos_type { : kokkos_type(other) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; - /* allocator.increase_usage_counter(other.data(), other.total_elements); */ } recycled_view &operator=( recycled_view &&other) noexcept { data_ref_counter = other.data_ref_counter; - /* allocator.deallocate(this->data(), total_elements); */ kokkos_type::operator=(other); total_elements = other.total_elements; - /* allocator.increase_usage_counter(other.data(), other.total_elements); */ return *this; } - ~recycled_view() { /*allocator.deallocate(this->data(), total_elements); */ } + ~recycled_view() { } }; template From 5cf099a93f181406270703adf8c05fe7bb390ac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 18 Mar 2023 01:33:31 +0100 Subject: [PATCH 03/46] Load correct cuda --- .jenkins/Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.jenkins/Jenkinsfile b/.jenkins/Jenkinsfile index 0353a649..32033164 100644 --- a/.jenkins/Jenkinsfile +++ b/.jenkins/Jenkinsfile @@ -72,7 +72,7 @@ pipeline { steps { dir('CPPuddle') { sh ''' - module load cuda + module load cuda/11.2.2 #./scripts/build_dependencies.sh ${build_type} ${compiler} ''' } @@ -82,7 +82,7 @@ pipeline { steps { dir('CPPuddle') { sh ''' - module load cuda + module load cuda/11.2.2 ./scripts/configure_build_directory.sh ${build_type} ${compiler} cd build/${compiler}-${build_type} make -j4 @@ -94,7 +94,7 @@ pipeline { steps { dir('CPPuddle') { sh ''' - module load cuda + module load cuda/11.2.2 cd build/${compiler}-${build_type} ctest -j4 ''' From 93cb5d27f01105fc8b22b91c9bbed88fc6dbdf85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 28 Apr 2023 16:55:31 +0200 Subject: [PATCH 04/46] WIP --- include/buffer_manager.hpp | 105 ++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 32 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 1e583349..7dbeebad 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -19,7 +19,35 @@ #include #endif - +// TODO Add location +// Support three modes: +// - Location unaware allocator +// --> Ignores tuple entry regarding location +// --> Use buffer_manager id == 0 by default? +// - Location aware allocator (stateful, contains the index) +// --> Still use buffer manager ID 0 +// --> Respects tuple entry regarding location +// - Location separated allocator (stateful, contains the index) +// --> One buffer manager per location +// --> allocate (read-lock) +// --> deallocate (write-lock) +// --> allocate (write lock --> move entry to used list?) +// - Location separated allocator (not stateful, but relies on a hashmap)? -- probably not required + +// TODO Remove hashmap +// Now that the reference counting is gone +// there's no need to keep the hash map is there? 
+// --> Get rid of the hash map +// --> Rework when to lock accordingly + +// Give location when allocating (get buffer tied to location) +// Store location in tuple +// Have one buffer manager per location +// Location optional when deallocating +// -> Either search all buffer managers +// -> Or use a hashmap +// Locking optional: do not delete entries, just increment/decrement an atomic usage counter Hint (only have a read lock, use write lock only if we need to create a new buffer +// increment when wanting to use it (check if ret=1, if ret>1 continue search and reset to 1 namespace recycler { constexpr size_t number_instances = 1; namespace detail { @@ -135,7 +163,7 @@ class buffer_recycler { // Tuple content: Pointer to buffer, buffer_size, location ID, Flag // The flag at the end controls whether to buffer content is to be reused as // well - using buffer_entry_type = std::tuple; + using buffer_entry_type = std::tuple, bool>; public: /// Cleanup and delete this singleton @@ -164,39 +192,48 @@ class buffer_recycler { instance()[id].number_allocation++; #endif // Check for unused buffers we can recycle: + // TODO Add Read lock for (auto iter = instance()[id].unused_buffer_list.begin(); iter != instance()[id].unused_buffer_list.end(); iter++) { - auto tuple = *iter; - if (std::get<1>(tuple) == number_of_elements) { - instance()[id].unused_buffer_list.erase(iter); - /* std::get<2>(tuple)++; // increase usage counter to 1 */ - - // handle the switch from aggressive to non aggressive reusage (or - // vice-versa) - if (manage_content_lifetime && !std::get<3>(tuple)) { - util::uninitialized_value_construct_n(std::get<0>(tuple), - number_of_elements); - std::get<3>(tuple) = true; - } else if (!manage_content_lifetime && std::get<3>(tuple)) { - util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); - std::get<3>(tuple) = false; - } - instance()[id].buffer_map.insert({std::get<0>(tuple), tuple}); + auto &tuple = *iter; + if (std::get<1>(tuple) == number_of_elements && std::get<2>(tuple) == 0) { + const size_t life_counter = std::get<2>(tuple)++; // increase usage counter to 1 + if (life_counter == 1) { // Check if we're the first one to increase + + // handle the switch from aggressive to non aggressive reusage (or + // vice-versa) + if (manage_content_lifetime && !std::get<3>(tuple)) { + util::uninitialized_value_construct_n(std::get<0>(tuple), + number_of_elements); + std::get<3>(tuple) = true; + } else if (!manage_content_lifetime && std::get<3>(tuple)) { + util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); + std::get<3>(tuple) = false; + } + // TODO buffer map? 
+ /* instance()[id].buffer_map.insert({std::get<0>(tuple), tuple}); */ #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_recycling++; + instance()[id].number_recycling++; #endif - return std::get<0>(tuple); + return std::get<0>(tuple); + } else { // other thread beat us: reset to 1 and continue search + std::get<2>(tuple) = 1; + } } } // No unused buffer found -> Create new one and return it try { Host_Allocator alloc; + // TODO Allocate outside of write lock T *buffer = alloc.allocate(number_of_elements); - instance()[id].buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS + // TODO push into during write lock + /* auto buffer_tuple = std::make_tuple(buffer, number_of_elements, 1, */ + /* manage_content_lifetime); */ + instance()[id].unused_buffer_list.emplace_back(buffer, number_of_elements, 1, + manage_content_lifetime); + /* instance()[id].buffer_map.insert({buffer, buffer_tuple}); */ +#ifdef CPPUDDLE_HAVE_COUNTERS instance()[id].number_creation++; #endif if (manage_content_lifetime) { @@ -211,9 +248,11 @@ class buffer_recycler { // We've done all we can in here Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); - instance()[id].buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); + /* auto buffer_tuple = std::make_tuple(buffer, number_of_elements, 1, */ + /* manage_content_lifetime); */ + instance()[id].unused_buffer_list.emplace_back(buffer, number_of_elements, 1, + manage_content_lifetime); + /* instance()[id].buffer_map.insert({buffer, buffer_tuple}); */ #ifdef CPPUDDLE_HAVE_COUNTERS instance()[id].number_creation++; instance()[id].number_bad_alloc++; @@ -232,12 +271,13 @@ class buffer_recycler { auto it = instance()[id].buffer_map.find(memory_location); assert(it != instance()[id].buffer_map.end()); auto &tuple = it->second; - // sanity checks: assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[id].unused_buffer_list.push_front(tuple); - instance()[id].buffer_map.erase(memory_location); + assert(std::get<2>(tuple) == 1); + // move to the unused_buffer list + /* instance()[id].unused_buffer_list.push_front(tuple); */ + /* instance()[id].buffer_map.erase(memory_location); */ + std::get<2>(tuple) = 0; // mark as fit for reusage } private: @@ -290,7 +330,8 @@ class buffer_recycler { alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } for (auto &map_tuple : buffer_map) { - auto buffer_tuple = map_tuple.second; + // + auto &buffer_tuple = map_tuple.second; Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); From a2924b1cfc93133e47d6fd70fd085887b15f89fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 28 Apr 2023 17:52:41 +0200 Subject: [PATCH 05/46] Revert "WIP" This reverts commit 93cb5d27f01105fc8b22b91c9bbed88fc6dbdf85. --- include/buffer_manager.hpp | 105 +++++++++++-------------------------- 1 file changed, 32 insertions(+), 73 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 7dbeebad..1e583349 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -19,35 +19,7 @@ #include #endif -// TODO Add location -// Support three modes: -// - Location unaware allocator -// --> Ignores tuple entry regarding location -// --> Use buffer_manager id == 0 by default? 
-// - Location aware allocator (stateful, contains the index) -// --> Still use buffer manager ID 0 -// --> Respects tuple entry regarding location -// - Location separated allocator (stateful, contains the index) -// --> One buffer manager per location -// --> allocate (read-lock) -// --> deallocate (write-lock) -// --> allocate (write lock --> move entry to used list?) -// - Location separated allocator (not stateful, but relies on a hashmap)? -- probably not required - -// TODO Remove hashmap -// Now that the reference counting is gone -// there's no need to keep the hash map is there? -// --> Get rid of the hash map -// --> Rework when to lock accordingly - -// Give location when allocating (get buffer tied to location) -// Store location in tuple -// Have one buffer manager per location -// Location optional when deallocating -// -> Either search all buffer managers -// -> Or use a hashmap -// Locking optional: do not delete entries, just increment/decrement an atomic usage counter Hint (only have a read lock, use write lock only if we need to create a new buffer -// increment when wanting to use it (check if ret=1, if ret>1 continue search and reset to 1 + namespace recycler { constexpr size_t number_instances = 1; namespace detail { @@ -163,7 +135,7 @@ class buffer_recycler { // Tuple content: Pointer to buffer, buffer_size, location ID, Flag // The flag at the end controls whether to buffer content is to be reused as // well - using buffer_entry_type = std::tuple, bool>; + using buffer_entry_type = std::tuple; public: /// Cleanup and delete this singleton @@ -192,48 +164,39 @@ class buffer_recycler { instance()[id].number_allocation++; #endif // Check for unused buffers we can recycle: - // TODO Add Read lock for (auto iter = instance()[id].unused_buffer_list.begin(); iter != instance()[id].unused_buffer_list.end(); iter++) { - auto &tuple = *iter; - if (std::get<1>(tuple) == number_of_elements && std::get<2>(tuple) == 0) { - const size_t life_counter = std::get<2>(tuple)++; // increase usage counter to 1 - if (life_counter == 1) { // Check if we're the first one to increase - - // handle the switch from aggressive to non aggressive reusage (or - // vice-versa) - if (manage_content_lifetime && !std::get<3>(tuple)) { - util::uninitialized_value_construct_n(std::get<0>(tuple), - number_of_elements); - std::get<3>(tuple) = true; - } else if (!manage_content_lifetime && std::get<3>(tuple)) { - util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); - std::get<3>(tuple) = false; - } - // TODO buffer map? 
- /* instance()[id].buffer_map.insert({std::get<0>(tuple), tuple}); */ + auto tuple = *iter; + if (std::get<1>(tuple) == number_of_elements) { + instance()[id].unused_buffer_list.erase(iter); + /* std::get<2>(tuple)++; // increase usage counter to 1 */ + + // handle the switch from aggressive to non aggressive reusage (or + // vice-versa) + if (manage_content_lifetime && !std::get<3>(tuple)) { + util::uninitialized_value_construct_n(std::get<0>(tuple), + number_of_elements); + std::get<3>(tuple) = true; + } else if (!manage_content_lifetime && std::get<3>(tuple)) { + util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); + std::get<3>(tuple) = false; + } + instance()[id].buffer_map.insert({std::get<0>(tuple), tuple}); #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_recycling++; + instance()[id].number_recycling++; #endif - return std::get<0>(tuple); - } else { // other thread beat us: reset to 1 and continue search - std::get<2>(tuple) = 1; - } + return std::get<0>(tuple); } } // No unused buffer found -> Create new one and return it try { Host_Allocator alloc; - // TODO Allocate outside of write lock T *buffer = alloc.allocate(number_of_elements); - // TODO push into during write lock - /* auto buffer_tuple = std::make_tuple(buffer, number_of_elements, 1, */ - /* manage_content_lifetime); */ - instance()[id].unused_buffer_list.emplace_back(buffer, number_of_elements, 1, - manage_content_lifetime); - /* instance()[id].buffer_map.insert({buffer, buffer_tuple}); */ -#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[id].buffer_map.insert( + {buffer, std::make_tuple(buffer, number_of_elements, 1, + manage_content_lifetime)}); +#ifdef CPPUDDLE_HAVE_COUNTERS instance()[id].number_creation++; #endif if (manage_content_lifetime) { @@ -248,11 +211,9 @@ class buffer_recycler { // We've done all we can in here Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); - /* auto buffer_tuple = std::make_tuple(buffer, number_of_elements, 1, */ - /* manage_content_lifetime); */ - instance()[id].unused_buffer_list.emplace_back(buffer, number_of_elements, 1, - manage_content_lifetime); - /* instance()[id].buffer_map.insert({buffer, buffer_tuple}); */ + instance()[id].buffer_map.insert( + {buffer, std::make_tuple(buffer, number_of_elements, 1, + manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS instance()[id].number_creation++; instance()[id].number_bad_alloc++; @@ -271,13 +232,12 @@ class buffer_recycler { auto it = instance()[id].buffer_map.find(memory_location); assert(it != instance()[id].buffer_map.end()); auto &tuple = it->second; + // sanity checks: assert(std::get<1>(tuple) == number_of_elements); - assert(std::get<2>(tuple) == 1); - // move to the unused_buffer list - /* instance()[id].unused_buffer_list.push_front(tuple); */ - /* instance()[id].buffer_map.erase(memory_location); */ - std::get<2>(tuple) = 0; // mark as fit for reusage + instance()[id].unused_buffer_list.push_front(tuple); + instance()[id].buffer_map.erase(memory_location); + } private: @@ -330,8 +290,7 @@ class buffer_recycler { alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } for (auto &map_tuple : buffer_map) { - // - auto &buffer_tuple = map_tuple.second; + auto buffer_tuple = map_tuple.second; Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); From e61a52209c2a4f3a0cc4e7b1a8a8f0491282335d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 17 May 2023 18:39:42 -0500 Subject: [PATCH 06/46] 
Support multiple locations --- include/buffer_manager.hpp | 73 +++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 1e583349..7dc2a099 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -21,7 +23,7 @@ namespace recycler { -constexpr size_t number_instances = 1; +constexpr size_t number_instances = 4; namespace detail { namespace util { @@ -157,18 +159,24 @@ class buffer_recycler { } /// Tries to recycle or create a buffer of type T and size number_elements. - static T *get(size_t number_of_elements, bool manage_content_lifetime, size_t id = 0) { + static T *get(size_t number_of_elements, bool manage_content_lifetime, + std::optional location_hint = std::nullopt) { init_callbacks_once(); + size_t location_id = 1; + if (location_hint) + location_id = location_hint.value(); + + #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_allocation++; + instance()[location_id].number_allocation++; #endif // Check for unused buffers we can recycle: - for (auto iter = instance()[id].unused_buffer_list.begin(); - iter != instance()[id].unused_buffer_list.end(); iter++) { + for (auto iter = instance()[location_id].unused_buffer_list.begin(); + iter != instance()[location_id].unused_buffer_list.end(); iter++) { auto tuple = *iter; if (std::get<1>(tuple) == number_of_elements) { - instance()[id].unused_buffer_list.erase(iter); + instance()[location_id].unused_buffer_list.erase(iter); /* std::get<2>(tuple)++; // increase usage counter to 1 */ // handle the switch from aggressive to non aggressive reusage (or @@ -181,9 +189,9 @@ class buffer_recycler { util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); std::get<3>(tuple) = false; } - instance()[id].buffer_map.insert({std::get<0>(tuple), tuple}); + instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_recycling++; + instance()[location_id].number_recycling++; #endif return std::get<0>(tuple); } @@ -193,11 +201,11 @@ class buffer_recycler { try { Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); - instance()[id].buffer_map.insert( + instance()[location_id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_creation++; + instance()[location_id].number_creation++; #endif if (manage_content_lifetime) { util::uninitialized_value_construct_n(buffer, number_of_elements); @@ -211,12 +219,12 @@ class buffer_recycler { // We've done all we can in here Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); - instance()[id].buffer_map.insert( + instance()[location_id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, manage_content_lifetime)}); #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_creation++; - instance()[id].number_bad_alloc++; + instance()[location_id].number_creation++; + instance()[location_id].number_bad_alloc++; #endif if (manage_content_lifetime) { util::uninitialized_value_construct_n(buffer, number_of_elements); @@ -225,19 +233,36 @@ class buffer_recycler { } } - static void mark_unused(T *memory_location, size_t number_of_elements, size_t id = 0) { + static void mark_unused(T *memory_location, size_t number_of_elements, + std::optional location_hint = std::nullopt) { + size_t 
locations_start = 0; + size_t locations_end = number_instances; + if (location_hint) { + locations_start = location_hint.value(); + locations_end = location_hint.value() + 1; + } + + bool found = false; + for(size_t location_d = locations_start; location_d < locations_end; location_d++) { + if (instance()[location_d].buffer_map.find(memory_location) != + instance()[location_d].buffer_map.end()) { + found = true; #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[id].number_dealloacation++; + instance()[location_d].number_dealloacation++; #endif - auto it = instance()[id].buffer_map.find(memory_location); - assert(it != instance()[id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[id].unused_buffer_list.push_front(tuple); - instance()[id].buffer_map.erase(memory_location); - + auto it = instance()[location_d].buffer_map.find(memory_location); + assert(it != instance()[location_d].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_d].unused_buffer_list.push_front(tuple); + instance()[location_d].buffer_map.erase(memory_location); + } + } + if (!found) { + throw std::runtime_error("Tried to delete non-existing buffer"); + } } private: From 2896878ed3b7e877984dd422d09f44e799a51a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 26 May 2023 22:45:32 -0500 Subject: [PATCH 07/46] Add numa aware allocators --- CMakeLists.txt | 9 + include/buffer_manager.hpp | 28 +-- include/hpx_buffer_util.hpp | 94 +++++++++ scripts/configure_build_directory.sh | 4 +- tests/allocator_hpx_test.cpp | 276 +++++++++++++++++++-------- 5 files changed, 307 insertions(+), 104 deletions(-) create mode 100644 include/hpx_buffer_util.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6362f8eb..3d09eb5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,12 @@ endif() ## Interface targets add_library(buffer_manager SHARED src/buffer_manager_definitions.cpp) +if (CPPUDDLE_WITH_HPX) + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX") +endif() +if (CPPUDDLE_WITH_COUNTERS) + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") +endif() target_include_directories(buffer_manager INTERFACE $ $ @@ -212,6 +218,9 @@ if (CPPUDDLE_WITH_TESTS) if (CPPUDDLE_WITH_COUNTERS) add_compile_definitions(CPPUDDLE_HAVE_COUNTERS) endif() + if (CPPUDDLE_WITH_HPX) + add_compile_definitions(CPPUDDLE_WITH_HPX) + endif() if (CPPUDDLE_WITH_MULTIGPU_SUPPORT) add_compile_definitions(CPPUDDLE_HAVE_MULTIGPU) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 7dc2a099..ed22d7b6 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -55,24 +55,19 @@ class buffer_recycler { /// Returns and allocated buffer of the requested size - this may be a reused /// buffer template - static T *get(size_t number_elements, bool manage_content_lifetime = false) { + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional location_hint = std::nullopt) { std::lock_guard guard(instance().mut); return buffer_manager::get(number_elements, - manage_content_lifetime); + manage_content_lifetime, location_hint); } /// Marks an buffer as unused and fit for reusage template - static void mark_unused(T *p, size_t number_elements) { + static void mark_unused(T *p, size_t number_elements, + 
std::optional location_hint = std::nullopt) { std::lock_guard guard(instance().mut); return buffer_manager::mark_unused(p, number_elements); } - /// Increase the reference coutner of a buffer - /* template */ - /* static void increase_usage_counter(T *p, size_t number_elements) noexcept { */ - /* std::lock_guard guard(instance().mut); */ - /* return buffer_manager::increase_usage_counter( */ - /* p, number_elements); */ - /* } */ /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { std::lock_guard guard(instance().mut); @@ -163,9 +158,11 @@ class buffer_recycler { std::optional location_hint = std::nullopt) { init_callbacks_once(); - size_t location_id = 1; - if (location_hint) + size_t location_id = 0; + if (location_hint) { location_id = location_hint.value(); + /* std::cout << " " << location_id; */ + } #ifdef CPPUDDLE_HAVE_COUNTERS @@ -378,10 +375,6 @@ class buffer_recycler { buffer_recycler operator=(buffer_recycler &&other) = delete; }; -/* template */ -/* std::unique_ptr> */ -/* buffer_recycler::buffer_manager::manager_instance{}; */ - template struct recycle_allocator { using value_type = T; recycle_allocator() noexcept = default; @@ -438,9 +431,6 @@ struct aggressive_recycle_allocator { // Do nothing here - Contents will be destroyed when the buffer manager is // destroyed, not before } - /* void increase_usage_counter(T *p, size_t n) { */ - /* buffer_recycler::increase_usage_counter(p, n); */ - /* } */ }; template constexpr bool diff --git a/include/hpx_buffer_util.hpp b/include/hpx_buffer_util.hpp new file mode 100644 index 00000000..549617f6 --- /dev/null +++ b/include/hpx_buffer_util.hpp @@ -0,0 +1,94 @@ +// Copyright (c) 2023 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + + +#ifndef CPPUDDLE_HPX_BUFFER_UTIL_HPP +#define CPPUDDLE_HPX_BUFFER_UTIL_HPP + +#include "buffer_manager.hpp" +#include + +namespace recycler { +namespace detail { + +template struct numa_aware_recycle_allocator { + using value_type = T; + numa_aware_recycle_allocator() noexcept = default; + template + explicit numa_aware_recycle_allocator( + numa_aware_recycle_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data = buffer_recycler::get(n, false, hpx::get_worker_thread_num()); + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_recycler::mark_unused(p, n); + } + template + inline void construct(T *p, Args... 
args) noexcept { + ::new (static_cast(p)) T(std::forward(args)...); + } + void destroy(T *p) { p->~T(); } +}; +template +constexpr bool +operator==(numa_aware_recycle_allocator const &, + numa_aware_recycle_allocator const &) noexcept { + return true; +} +template +constexpr bool +operator!=(numa_aware_recycle_allocator const &, + numa_aware_recycle_allocator const &) noexcept { + return false; +} + +/// Recycles not only allocations but also the contents of a buffer +template +struct numa_aware_aggressive_recycle_allocator { + using value_type = T; + numa_aware_aggressive_recycle_allocator() noexcept = default; + template + explicit numa_aware_aggressive_recycle_allocator( + numa_aware_aggressive_recycle_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data = buffer_recycler::get( + n, true, hpx::get_worker_thread_num()); // also initializes the buffer if it isn't reused + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_recycler::mark_unused(p, n); + } + template + inline void construct(T *p, Args... args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +}; +template +constexpr bool +operator==(numa_aware_aggressive_recycle_allocator const &, + numa_aware_aggressive_recycle_allocator const &) noexcept { + return true; +} +template +constexpr bool +operator!=(numa_aware_aggressive_recycle_allocator const &, + numa_aware_aggressive_recycle_allocator const &) noexcept { + return false; +} + +} +template ::value, int> = 0> +using numa_aware_recycle_std = detail::numa_aware_recycle_allocator>; +template ::value, int> = 0> +using numa_aware_aggressive_recycle_std = + detail::numa_aware_aggressive_recycle_allocator>; +} + +#endif diff --git a/scripts/configure_build_directory.sh b/scripts/configure_build_directory.sh index 6ec52e17..177b8b81 100755 --- a/scripts/configure_build_directory.sh +++ b/scripts/configure_build_directory.sh @@ -51,9 +51,9 @@ mkdir -p ${INSTALL_DIR} pushd ${BUILD_DIR} # TODO Reactivate CUDA/KOKKOS once we have a newer cmake version on the test machine if [[ "${CXX}" == "clang++" ]]; then # clang/cmake too old on our usual machine - compile without CUDA - cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=OFF -DCPPUDDLE_MUTEXLESS_MODE=OFF -DCPPUDDLE_WITH_CUDA=OFF -DCPPUDDLE_WITH_KOKKOS=OFF -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. + cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=ON -DCPPUDDLE_MUTEXLESS_MODE=OFF -DCPPUDDLE_WITH_CUDA=ON -DCPPUDDLE_WITH_KOKKOS=ON -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. else - cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=OFF -DCPPUDDLE_MUTEXLESS_MODE=OFF -DCPPUDDLE_WITH_CUDA=OFF -DCPPUDDLE_WITH_KOKKOS=OFF -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. 
+ cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=ON -DCPPUDDLE_MUTEXLESS_MODE=ON -DCPPUDDLE_WITH_CUDA=ON -DCPPUDDLE_WITH_KOKKOS=ON -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. fi popd cp ${BUILD_DIR}/compile_commands.json ${SCRIPTS_DIR}/../compile_commands.json diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 3bd28066..cd399e09 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -16,6 +16,7 @@ #include #include "../include/buffer_manager.hpp" +#include "../include/hpx_buffer_util.hpp" int hpx_main(int argc, char *argv[]) { @@ -74,105 +75,214 @@ int hpx_main(int argc, char *argv[]) { assert(number_futures >= 1); // NOLINT assert(number_futures <= max_number_futures); // NOLINT - size_t aggressive_duration = 0; - size_t recycle_duration = 0; - size_t default_duration = 0; - - // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( - array_size, double{}); - std::vector> buffer2( - array_size, double{}); - std::vector> buffer3( - array_size, double{}); - std::vector> buffer4( - array_size, double{}); - } + size_t aggressive_duration = 0; + size_t recycle_duration = 0; + size_t default_duration = 0; - // Aggressive recycle Test: - { - auto begin = std::chrono::high_resolution_clock::now(); - std::vector> futs(max_number_futures); - for (size_t i = 0; i < max_number_futures; i++) { - futs[i] = hpx::make_ready_future(); + // ensure that at least 4 buffers have to created for unit testing + { + std::vector> buffer1( + array_size, double{}); + std::vector> buffer2( + array_size, double{}); + std::vector> buffer3( + array_size, double{}); + std::vector> buffer4( + array_size, double{}); } - for (size_t pass = 0; pass < passes; pass++) { - for (size_t i = 0; i < number_futures; i++) { - futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); - }); + + // Aggressive recycle Test: + { + auto begin = std::chrono::high_resolution_clock::now(); + std::vector> futs(max_number_futures); + for (size_t i = 0; i < max_number_futures; i++) { + futs[i] = hpx::make_ready_future(); + } + for (size_t pass = 0; pass < passes; pass++) { + for (size_t i = 0; i < number_futures; i++) { + futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { + std::vector> test6( + array_size, double{}); + }); + } } + auto when = hpx::when_all(futs); + when.wait(); + auto end = std::chrono::high_resolution_clock::now(); + aggressive_duration = + std::chrono::duration_cast(end - begin) + .count(); + std::cout << "\n==> NUMA-aware aggressive recycle allocation test took " + << aggressive_duration << "ms" << std::endl; } - auto when = hpx::when_all(futs); - when.wait(); - auto end = std::chrono::high_resolution_clock::now(); - aggressive_duration = - std::chrono::duration_cast(end - begin) - .count(); - std::cout << "\n==> Aggressive recycle allocation test took " - << aggressive_duration << "ms" << std::endl; - } + recycler::force_cleanup(); // Cleanup all buffers and the managers for better + // comparison - { - auto begin = std::chrono::high_resolution_clock::now(); - std::vector> futs(max_number_futures); - for (size_t i = 0; i < max_number_futures; i++) { - futs[i] = hpx::make_ready_future(); + { + auto begin = 
std::chrono::high_resolution_clock::now(); + std::vector> futs(max_number_futures); + for (size_t i = 0; i < max_number_futures; i++) { + futs[i] = hpx::make_ready_future(); + } + for (size_t pass = 0; pass < passes; pass++) { + for (size_t i = 0; i < number_futures; i++) { + futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { + std::vector> test6(array_size, + double{}); + }); + } + } + auto when = hpx::when_all(futs); + when.wait(); + auto end = std::chrono::high_resolution_clock::now(); + recycle_duration = + std::chrono::duration_cast(end - begin) + .count(); + std::cout << "\n==> NUMA-aware recycle allocation test took " << recycle_duration + << "ms" << std::endl; } - for (size_t pass = 0; pass < passes; pass++) { - for (size_t i = 0; i < number_futures; i++) { - futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, - double{}); - }); + recycler::force_cleanup(); // Cleanup all buffers and the managers for better + // comparison + + // Same test using std::allocator: + { + auto begin = std::chrono::high_resolution_clock::now(); + std::vector> futs(max_number_futures); + for (size_t i = 0; i < max_number_futures; i++) { + futs[i] = hpx::make_ready_future(); + } + for (size_t pass = 0; pass < passes; pass++) { + for (size_t i = 0; i < number_futures; i++) { + futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { + std::vector test6(array_size, double{}); + }); + } } + auto when = hpx::when_all(futs); + when.wait(); + auto end = std::chrono::high_resolution_clock::now(); + default_duration = + std::chrono::duration_cast(end - begin) + .count(); + std::cout << "\n==> Non-recycle allocation test took " << default_duration + << "ms" << std::endl; + } + + if (aggressive_duration < recycle_duration) { + std::cout << "Test information: NUMA-aware aggressive recycler was faster than normal " + "recycler!" + << std::endl; + } + if (recycle_duration < default_duration) { + std::cout << "Test information: NUMA-aware recycler was faster than default allocator!" 
+ << std::endl; } - auto when = hpx::when_all(futs); - when.wait(); - auto end = std::chrono::high_resolution_clock::now(); - recycle_duration = - std::chrono::duration_cast(end - begin) - .count(); - std::cout << "\n==> Recycle allocation test took " << recycle_duration - << "ms" << std::endl; } - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison - // Same test using std::allocator: { - auto begin = std::chrono::high_resolution_clock::now(); - std::vector> futs(max_number_futures); - for (size_t i = 0; i < max_number_futures; i++) { - futs[i] = hpx::make_ready_future(); + size_t aggressive_duration = 0; + size_t recycle_duration = 0; + size_t default_duration = 0; + + // ensure that at least 4 buffers have to created for unit testing + { + std::vector> buffer1( + array_size, double{}); + std::vector> buffer2( + array_size, double{}); + std::vector> buffer3( + array_size, double{}); + std::vector> buffer4( + array_size, double{}); } - for (size_t pass = 0; pass < passes; pass++) { - for (size_t i = 0; i < number_futures; i++) { - futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector test6(array_size, double{}); - }); + + // Aggressive recycle Test: + { + auto begin = std::chrono::high_resolution_clock::now(); + std::vector> futs(max_number_futures); + for (size_t i = 0; i < max_number_futures; i++) { + futs[i] = hpx::make_ready_future(); + } + for (size_t pass = 0; pass < passes; pass++) { + for (size_t i = 0; i < number_futures; i++) { + futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { + std::vector> test6( + array_size, double{}); + }); + } } + auto when = hpx::when_all(futs); + when.wait(); + auto end = std::chrono::high_resolution_clock::now(); + aggressive_duration = + std::chrono::duration_cast(end - begin) + .count(); + std::cout << "\n==> Aggressive recycle allocation test took " + << aggressive_duration << "ms" << std::endl; } - auto when = hpx::when_all(futs); - when.wait(); - auto end = std::chrono::high_resolution_clock::now(); - default_duration = - std::chrono::duration_cast(end - begin) - .count(); - std::cout << "\n==> Non-recycle allocation test took " << default_duration - << "ms" << std::endl; - } + recycler::force_cleanup(); // Cleanup all buffers and the managers for better + // comparison - if (aggressive_duration < recycle_duration) { - std::cout << "Test information: Aggressive recycler was faster than normal " - "recycler!" - << std::endl; - } - if (recycle_duration < default_duration) { - std::cout << "Test information: Recycler was faster than default allocator!" 
- << std::endl; + { + auto begin = std::chrono::high_resolution_clock::now(); + std::vector> futs(max_number_futures); + for (size_t i = 0; i < max_number_futures; i++) { + futs[i] = hpx::make_ready_future(); + } + for (size_t pass = 0; pass < passes; pass++) { + for (size_t i = 0; i < number_futures; i++) { + futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { + std::vector> test6(array_size, + double{}); + }); + } + } + auto when = hpx::when_all(futs); + when.wait(); + auto end = std::chrono::high_resolution_clock::now(); + recycle_duration = + std::chrono::duration_cast(end - begin) + .count(); + std::cout << "\n==> Recycle allocation test took " << recycle_duration + << "ms" << std::endl; + } + recycler::force_cleanup(); // Cleanup all buffers and the managers for better + // comparison + + // Same test using std::allocator: + { + auto begin = std::chrono::high_resolution_clock::now(); + std::vector> futs(max_number_futures); + for (size_t i = 0; i < max_number_futures; i++) { + futs[i] = hpx::make_ready_future(); + } + for (size_t pass = 0; pass < passes; pass++) { + for (size_t i = 0; i < number_futures; i++) { + futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { + std::vector test6(array_size, double{}); + }); + } + } + auto when = hpx::when_all(futs); + when.wait(); + auto end = std::chrono::high_resolution_clock::now(); + default_duration = + std::chrono::duration_cast(end - begin) + .count(); + std::cout << "\n==> Non-recycle allocation test took " << default_duration + << "ms" << std::endl; + } + + if (aggressive_duration < recycle_duration) { + std::cout << "Test information: Aggressive recycler was faster than normal " + "recycler!" + << std::endl; + } + if (recycle_duration < default_duration) { + std::cout << "Test information: Recycler was faster than default allocator!" 
+ << std::endl; + } } return hpx::finalize(); } From 3a08787bc9aa2b6aaa0e6ce3d62098ddf5f2a386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 1 Jun 2023 11:57:10 -0500 Subject: [PATCH 08/46] Drop global mutex --- CMakeLists.txt | 2 +- include/buffer_manager.hpp | 37 ++++++++++++++++++------------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 389b8866..c1eb2b4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -353,7 +353,7 @@ if (CPPUDDLE_WITH_TESTS) add_test(allocator_concurrency_test.analyse_recycle_rate cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.analyse_recycle_rate PROPERTIES FIXTURES_REQUIRED allocator_concurrency_output - PASS_REGULAR_EXPRESSION "==> Recycle rate: [ ]* 99.844%" + PASS_REGULAR_EXPRESSION "==> Recycle rate: [ ]* 99.6885%" ) add_test(allocator_concurrency_test.analyse_marked_buffers_cleanup cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.analyse_marked_buffers_cleanup PROPERTIES diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ed22d7b6..097bc3ec 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -23,7 +23,7 @@ namespace recycler { -constexpr size_t number_instances = 4; +constexpr size_t number_instances = 128; namespace detail { namespace util { @@ -57,7 +57,6 @@ class buffer_recycler { template static T *get(size_t number_elements, bool manage_content_lifetime = false, std::optional location_hint = std::nullopt) { - std::lock_guard guard(instance().mut); return buffer_manager::get(number_elements, manage_content_lifetime, location_hint); } @@ -65,12 +64,10 @@ class buffer_recycler { template static void mark_unused(T *p, size_t number_elements, std::optional location_hint = std::nullopt) { - std::lock_guard guard(instance().mut); return buffer_manager::mark_unused(p, number_elements); } /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { - std::lock_guard guard(instance().mut); for (const auto &clean_function : instance().total_cleanup_callbacks) { clean_function(); @@ -78,7 +75,6 @@ class buffer_recycler { } /// Deallocated all currently unused buffer static void clean_unused_buffers() { - std::lock_guard guard(instance().mut); for (const auto &clean_function : instance().partial_cleanup_callbacks) { clean_function(); @@ -99,10 +95,6 @@ class buffer_recycler { /// Callbacks for partial buffer_manager cleanups - each callback deallocates /// all unused buffers of a manager std::list> partial_cleanup_callbacks; - /// One Mutex to control concurrent access - Since we do not actually ever - /// return the singleton instance anywhere, this should hopefully suffice We - /// want more fine-grained concurrent access eventually - std::mutex mut; /// default, private constructor - not automatically constructed due to the /// deleted constructors buffer_recycler() = default; @@ -142,6 +134,7 @@ class buffer_recycler { /// Cleanup all buffers not currently in use static void clean_unused_buffers_only() { for (auto i = 0; i < number_instances; i++) { + std::lock_guard guard(instance()[i].mut); for (auto &buffer_tuple : instance()[i].unused_buffer_list) { Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { @@ -161,8 +154,8 @@ class buffer_recycler { size_t location_id = 0; if (location_hint) { location_id = location_hint.value(); - /* std::cout << " " << location_id; */ } + std::lock_guard 
guard(instance()[location_id].mut); #ifdef CPPUDDLE_HAVE_COUNTERS @@ -174,7 +167,6 @@ class buffer_recycler { auto tuple = *iter; if (std::get<1>(tuple) == number_of_elements) { instance()[location_id].unused_buffer_list.erase(iter); - /* std::get<2>(tuple)++; // increase usage counter to 1 */ // handle the switch from aggressive to non aggressive reusage (or // vice-versa) @@ -240,21 +232,22 @@ class buffer_recycler { } bool found = false; - for(size_t location_d = locations_start; location_d < locations_end; location_d++) { - if (instance()[location_d].buffer_map.find(memory_location) != - instance()[location_d].buffer_map.end()) { + for(size_t location_id = locations_start; location_id < locations_end; location_id++) { + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { found = true; #ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_d].number_dealloacation++; + instance()[location_id].number_dealloacation++; #endif - auto it = instance()[location_d].buffer_map.find(memory_location); - assert(it != instance()[location_d].buffer_map.end()); + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); auto &tuple = it->second; // sanity checks: assert(std::get<1>(tuple) == number_of_elements); // move to the unused_buffer list - instance()[location_d].unused_buffer_list.push_front(tuple); - instance()[location_d].buffer_map.erase(memory_location); + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); } } if (!found) { @@ -267,6 +260,8 @@ class buffer_recycler { std::unordered_map buffer_map{}; /// List with all buffers currently not used std::list unused_buffer_list{}; + /// Access control + std::mutex mut; #ifdef CPPUDDLE_HAVE_COUNTERS /// Performance counters size_t number_allocation{0}, number_dealloacation{0}; @@ -299,6 +294,10 @@ class buffer_recycler { public: ~buffer_manager() { + // All operations should have finished before this is happening + // Should be fine when throwing as there's no real point in recovering at that stage + std::lock_guard guard(mut); + if (number_allocation == 0 && number_recycling == 0 && number_bad_alloc == 0 && number_creation == 0 && unused_buffer_list.empty() && buffer_map.empty()) { From 0d08d73dc331b451f0810067d3958f9d4da84015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 09:35:31 -0500 Subject: [PATCH 09/46] Test dealloc hints --- include/buffer_manager.hpp | 48 ++++++++++++++++++++++++++++--------- include/hpx_buffer_util.hpp | 12 ++++++---- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 097bc3ec..ecd8c06a 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -224,19 +224,41 @@ class buffer_recycler { static void mark_unused(T *memory_location, size_t number_of_elements, std::optional location_hint = std::nullopt) { - size_t locations_start = 0; - size_t locations_end = number_instances; + if (location_hint) { - locations_start = location_hint.value(); - locations_end = location_hint.value() + 1; + size_t location_id = location_hint.value(); + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + 
instance()[location_id].number_dealloacation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + // hint was wrong - note that, and continue on with all other buffer + // managers +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_wrong_hints++; +#endif } - bool found = false; - for(size_t location_id = locations_start; location_id < locations_end; location_id++) { + for(size_t location_id = 0; location_id < number_instances; location_id++) { + if (location_hint) { + if (location_hint.value() == location_id) { + continue; // already tried this -> skip + } + } std::lock_guard guard(instance()[location_id].mut); if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { - found = true; #ifdef CPPUDDLE_HAVE_COUNTERS instance()[location_id].number_dealloacation++; #endif @@ -248,11 +270,12 @@ class buffer_recycler { // move to the unused_buffer list instance()[location_id].unused_buffer_list.push_front(tuple); instance()[location_id].buffer_map.erase(memory_location); + return; // Success } } - if (!found) { - throw std::runtime_error("Tried to delete non-existing buffer"); - } + + // Failure -- something is very wrong + throw std::runtime_error("Tried to delete non-existing buffer"); } private: @@ -264,7 +287,7 @@ class buffer_recycler { std::mutex mut; #ifdef CPPUDDLE_HAVE_COUNTERS /// Performance counters - size_t number_allocation{0}, number_dealloacation{0}; + size_t number_allocation{0}, number_dealloacation{0}, number_wrong_hints{0}; size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0}; #endif /// Singleton instance @@ -342,6 +365,9 @@ class buffer_recycler { << "--> Number cleaned up buffers: " " " << number_cleaned << std::endl + << "--> Number wrong deallocation hints: " + " " + << number_wrong_hints << std::endl << "--> Number of buffers that were marked as used upon " "cleanup: " << buffer_map.size() << std::endl diff --git a/include/hpx_buffer_util.hpp b/include/hpx_buffer_util.hpp index 549617f6..094e0cb5 100644 --- a/include/hpx_buffer_util.hpp +++ b/include/hpx_buffer_util.hpp @@ -16,15 +16,17 @@ namespace detail { template struct numa_aware_recycle_allocator { using value_type = T; numa_aware_recycle_allocator() noexcept = default; + size_t dealloc_hint{0}; template explicit numa_aware_recycle_allocator( numa_aware_recycle_allocator const &) noexcept {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get(n, false, hpx::get_worker_thread_num()); + dealloc_hint = hpx::get_worker_thread_num(); + T *data = buffer_recycler::get(n, false, dealloc_hint); return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); + buffer_recycler::mark_unused(p, n, dealloc_hint); } template inline void construct(T *p, Args... 
args) noexcept { @@ -50,16 +52,18 @@ template struct numa_aware_aggressive_recycle_allocator { using value_type = T; numa_aware_aggressive_recycle_allocator() noexcept = default; + size_t dealloc_hint{0}; template explicit numa_aware_aggressive_recycle_allocator( numa_aware_aggressive_recycle_allocator const &) noexcept {} T *allocate(std::size_t n) { + dealloc_hint = hpx::get_worker_thread_num(); T *data = buffer_recycler::get( - n, true, hpx::get_worker_thread_num()); // also initializes the buffer if it isn't reused + n, true, dealloc_hint); // also initializes the buffer if it isn't reused return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); + buffer_recycler::mark_unused(p, n, dealloc_hint); } template inline void construct(T *p, Args... args) noexcept { From 98763d0b93c667ea8daa80c8d235a5e7e2bde24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 10:41:30 -0500 Subject: [PATCH 10/46] Use different default hint --- include/hpx_buffer_util.hpp | 52 +++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/include/hpx_buffer_util.hpp b/include/hpx_buffer_util.hpp index 094e0cb5..71f2f98e 100644 --- a/include/hpx_buffer_util.hpp +++ b/include/hpx_buffer_util.hpp @@ -7,22 +7,26 @@ #ifndef CPPUDDLE_HPX_BUFFER_UTIL_HPP #define CPPUDDLE_HPX_BUFFER_UTIL_HPP +#include #include "buffer_manager.hpp" #include namespace recycler { namespace detail { -template struct numa_aware_recycle_allocator { +template + struct numa_aware_recycle_allocator { using value_type = T; - numa_aware_recycle_allocator() noexcept = default; - size_t dealloc_hint{0}; - template + const std::optional dealloc_hint; + numa_aware_recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num()) {} + explicit numa_aware_recycle_allocator(size_t hint) noexcept + : dealloc_hint(hint) {} explicit numa_aware_recycle_allocator( - numa_aware_recycle_allocator const &) noexcept {} + numa_aware_recycle_allocator const &) noexcept {} T *allocate(std::size_t n) { - dealloc_hint = hpx::get_worker_thread_num(); - T *data = buffer_recycler::get(n, false, dealloc_hint); + T *data = buffer_recycler::get( + n, false, hpx::get_worker_thread_num()); return data; } void deallocate(T *p, std::size_t n) { @@ -38,28 +42,36 @@ template constexpr bool operator==(numa_aware_recycle_allocator const &, numa_aware_recycle_allocator const &) noexcept { - return true; + if constexpr (std::is_same_v) + return true; + else + return false; } template constexpr bool operator!=(numa_aware_recycle_allocator const &, numa_aware_recycle_allocator const &) noexcept { - return false; + if constexpr (std::is_same_v) + return false; + else + return true; } /// Recycles not only allocations but also the contents of a buffer template struct numa_aware_aggressive_recycle_allocator { using value_type = T; - numa_aware_aggressive_recycle_allocator() noexcept = default; - size_t dealloc_hint{0}; - template + std::optional dealloc_hint; + numa_aware_aggressive_recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num()) {} + explicit numa_aware_aggressive_recycle_allocator(size_t hint) noexcept + : dealloc_hint(hint) {} explicit numa_aware_aggressive_recycle_allocator( - numa_aware_aggressive_recycle_allocator const &) noexcept {} + numa_aware_recycle_allocator const &) noexcept {} T *allocate(std::size_t n) { - dealloc_hint = hpx::get_worker_thread_num(); T *data = buffer_recycler::get( - n, true, dealloc_hint); // also initializes 
the buffer if it isn't reused + n, true, hpx::get_worker_thread_num()); // also initializes the buffer + // if it isn't reused return data; } void deallocate(T *p, std::size_t n) { @@ -78,13 +90,19 @@ template constexpr bool operator==(numa_aware_aggressive_recycle_allocator const &, numa_aware_aggressive_recycle_allocator const &) noexcept { - return true; + if constexpr (std::is_same_v) + return true; + else + return false; } template constexpr bool operator!=(numa_aware_aggressive_recycle_allocator const &, numa_aware_aggressive_recycle_allocator const &) noexcept { - return false; + if constexpr (std::is_same_v) + return false; + else + return true; } } From a46ea10b130255e27c89bf24371b47d49a9e622d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 10:58:22 -0500 Subject: [PATCH 11/46] Change test order --- include/hpx_buffer_util.hpp | 1 - tests/allocator_hpx_test.cpp | 107 ++++++++++++++++++----------------- 2 files changed, 56 insertions(+), 52 deletions(-) diff --git a/include/hpx_buffer_util.hpp b/include/hpx_buffer_util.hpp index 71f2f98e..54c52d1f 100644 --- a/include/hpx_buffer_util.hpp +++ b/include/hpx_buffer_util.hpp @@ -7,7 +7,6 @@ #ifndef CPPUDDLE_HPX_BUFFER_UTIL_HPP #define CPPUDDLE_HPX_BUFFER_UTIL_HPP -#include #include "buffer_manager.hpp" #include diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index cd399e09..8dae8d7b 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -20,7 +20,7 @@ int hpx_main(int argc, char *argv[]) { - constexpr size_t max_number_futures = 64; + constexpr size_t max_number_futures = 1024; size_t number_futures = 64; size_t array_size = 500000; size_t passes = 200; @@ -80,19 +80,7 @@ int hpx_main(int argc, char *argv[]) { size_t recycle_duration = 0; size_t default_duration = 0; - // ensure that at least 4 buffers have to created for unit testing - { - std::vector> buffer1( - array_size, double{}); - std::vector> buffer2( - array_size, double{}); - std::vector> buffer3( - array_size, double{}); - std::vector> buffer4( - array_size, double{}); - } - - // Aggressive recycle Test: + // test using std::allocator: { auto begin = std::chrono::high_resolution_clock::now(); std::vector> futs(max_number_futures); @@ -102,23 +90,21 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); + std::vector test6(array_size, double{}); }); } } auto when = hpx::when_all(futs); when.wait(); auto end = std::chrono::high_resolution_clock::now(); - aggressive_duration = + default_duration = std::chrono::duration_cast(end - begin) .count(); - std::cout << "\n==> NUMA-aware aggressive recycle allocation test took " - << aggressive_duration << "ms" << std::endl; + std::cout << "\n==> Non-recycle allocation test took " << default_duration + << "ms" << std::endl; } - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + // test using normal recycle allocator { auto begin = std::chrono::high_resolution_clock::now(); std::vector> futs(max_number_futures); @@ -145,7 +131,19 @@ int hpx_main(int argc, char *argv[]) { recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison - // Same test using std::allocator: + // ensure that at least 4 buffers have to created for unit testing + { + std::vector> buffer1( + array_size, double{}); + 
std::vector> buffer2( + array_size, double{}); + std::vector> buffer3( + array_size, double{}); + std::vector> buffer4( + array_size, double{}); + } + + // Aggressive recycle Test: { auto begin = std::chrono::high_resolution_clock::now(); std::vector> futs(max_number_futures); @@ -155,19 +153,24 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector test6(array_size, double{}); + std::vector> test6( + array_size, double{}); }); } } auto when = hpx::when_all(futs); when.wait(); auto end = std::chrono::high_resolution_clock::now(); - default_duration = + aggressive_duration = std::chrono::duration_cast(end - begin) .count(); - std::cout << "\n==> Non-recycle allocation test took " << default_duration - << "ms" << std::endl; + std::cout << "\n==> NUMA-aware aggressive recycle allocation test took " + << aggressive_duration << "ms" << std::endl; } + recycler::force_cleanup(); // Cleanup all buffers and the managers for better + // comparison + + if (aggressive_duration < recycle_duration) { std::cout << "Test information: NUMA-aware aggressive recycler was faster than normal " @@ -185,19 +188,7 @@ int hpx_main(int argc, char *argv[]) { size_t recycle_duration = 0; size_t default_duration = 0; - // ensure that at least 4 buffers have to created for unit testing - { - std::vector> buffer1( - array_size, double{}); - std::vector> buffer2( - array_size, double{}); - std::vector> buffer3( - array_size, double{}); - std::vector> buffer4( - array_size, double{}); - } - - // Aggressive recycle Test: + // Same test using std::allocator: { auto begin = std::chrono::high_resolution_clock::now(); std::vector> futs(max_number_futures); @@ -207,22 +198,19 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); + std::vector test6(array_size, double{}); }); } } auto when = hpx::when_all(futs); when.wait(); auto end = std::chrono::high_resolution_clock::now(); - aggressive_duration = + default_duration = std::chrono::duration_cast(end - begin) .count(); - std::cout << "\n==> Aggressive recycle allocation test took " - << aggressive_duration << "ms" << std::endl; + std::cout << "\n==> Non-recycle allocation test took " << default_duration + << "ms" << std::endl; } - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison { auto begin = std::chrono::high_resolution_clock::now(); @@ -250,7 +238,20 @@ int hpx_main(int argc, char *argv[]) { recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison - // Same test using std::allocator: + + // ensure that at least 4 buffers have to created for unit testing + { + std::vector> buffer1( + array_size, double{}); + std::vector> buffer2( + array_size, double{}); + std::vector> buffer3( + array_size, double{}); + std::vector> buffer4( + array_size, double{}); + } + + // Aggressive recycle Test: { auto begin = std::chrono::high_resolution_clock::now(); std::vector> futs(max_number_futures); @@ -260,19 +261,23 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector test6(array_size, double{}); + std::vector> 
test6( + array_size, double{}); }); } } auto when = hpx::when_all(futs); when.wait(); auto end = std::chrono::high_resolution_clock::now(); - default_duration = + aggressive_duration = std::chrono::duration_cast(end - begin) .count(); - std::cout << "\n==> Non-recycle allocation test took " << default_duration - << "ms" << std::endl; + std::cout << "\n==> Aggressive recycle allocation test took " + << aggressive_duration << "ms" << std::endl; } + recycler::force_cleanup(); // Cleanup all buffers and the managers for better + // comparison + if (aggressive_duration < recycle_duration) { std::cout << "Test information: Aggressive recycler was faster than normal " From 00a6546aa7d1d387b28f793e3581e54a35045534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 11:57:13 -0500 Subject: [PATCH 12/46] Replace C++14 utils with C++17 std --- include/buffer_manager.hpp | 60 +++++++++++++++----------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ecd8c06a..776829fd 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -26,29 +26,6 @@ namespace recycler { constexpr size_t number_instances = 128; namespace detail { -namespace util { -/// Helper methods for C++14 - this is obsolete for c++17 and only meant as a -/// temporary crutch -template -void uninitialized_value_construct_n(ForwardIt first, Size n) { - using Value = typename std::iterator_traits::value_type; - ForwardIt current = first; - for (; n > 0; (void)++current, --n) { - ::new (static_cast(std::addressof(*current))) Value(); - } -} -/// Helper methods for C++14 - this is obsolete for c++17 and only meant as a -/// temporary crutch -template -void destroy_n(ForwardIt first, Size n) { - using Value = typename std::iterator_traits::value_type; - ForwardIt current = first; - for (; n > 0; (void)++current, --n) { - current->~Value(); - } -} -} // namespace util - class buffer_recycler { // Public interface public: @@ -138,7 +115,7 @@ class buffer_recycler { for (auto &buffer_tuple : instance()[i].unused_buffer_list) { Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } @@ -171,11 +148,11 @@ class buffer_recycler { // handle the switch from aggressive to non aggressive reusage (or // vice-versa) if (manage_content_lifetime && !std::get<3>(tuple)) { - util::uninitialized_value_construct_n(std::get<0>(tuple), + std::uninitialized_value_construct_n(std::get<0>(tuple), number_of_elements); std::get<3>(tuple) = true; } else if (!manage_content_lifetime && std::get<3>(tuple)) { - util::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); + std::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); std::get<3>(tuple) = false; } instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); @@ -197,7 +174,7 @@ class buffer_recycler { instance()[location_id].number_creation++; #endif if (manage_content_lifetime) { - util::uninitialized_value_construct_n(buffer, number_of_elements); + std::uninitialized_value_construct_n(buffer, number_of_elements); } return buffer; } catch (std::bad_alloc &e) { @@ -216,7 +193,7 @@ class buffer_recycler { instance()[location_id].number_bad_alloc++; #endif if (manage_content_lifetime) { - util::uninitialized_value_construct_n(buffer, number_of_elements); + 
std::uninitialized_value_construct_n(buffer, number_of_elements); } return buffer; } @@ -329,7 +306,7 @@ class buffer_recycler { for (auto &buffer_tuple : unused_buffer_list) { Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } @@ -337,7 +314,7 @@ class buffer_recycler { auto buffer_tuple = map_tuple.second; Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { - util::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); } @@ -366,7 +343,7 @@ class buffer_recycler { " " << number_cleaned << std::endl << "--> Number wrong deallocation hints: " - " " + " " << number_wrong_hints << std::endl << "--> Number of buffers that were marked as used upon " "cleanup: " @@ -376,7 +353,6 @@ class buffer_recycler { << static_cast(number_recycling) / number_allocation * 100.0f << "%" << std::endl; - // assert(buffer_map.size() == 0); // Were there any buffers still used? #endif unused_buffer_list.clear(); buffer_map.clear(); @@ -423,13 +399,19 @@ template constexpr bool operator==(recycle_allocator const &, recycle_allocator const &) noexcept { - return true; + if constexpr (std::is_same_v) + return true; + else + return false; } template constexpr bool operator!=(recycle_allocator const &, recycle_allocator const &) noexcept { - return false; + if constexpr (std::is_same_v) + return false; + else + return true; } /// Recycles not only allocations but also the contents of a buffer @@ -461,13 +443,19 @@ template constexpr bool operator==(aggressive_recycle_allocator const &, aggressive_recycle_allocator const &) noexcept { - return true; + if constexpr (std::is_same_v) + return true; + else + return false; } template constexpr bool operator!=(aggressive_recycle_allocator const &, aggressive_recycle_allocator const &) noexcept { - return false; + if constexpr (std::is_same_v) + return false; + else + return true; } } // namespace detail From c2e297c00161f4b8988aee7a564e54a217928444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 12:25:09 -0500 Subject: [PATCH 13/46] Deleted buffer manager src --- src/buffer_manager_definitions.cpp | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 src/buffer_manager_definitions.cpp diff --git a/src/buffer_manager_definitions.cpp b/src/buffer_manager_definitions.cpp deleted file mode 100644 index 34e635b0..00000000 --- a/src/buffer_manager_definitions.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2020-2021 Gregor Daiß -// -// Distributed under the Boost Software License, Version 1.0. 
(See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -//#include "../include/buffer_manager.hpp" - -// Instance defintions -/* std::unique_ptr */ -/* recycler::detail::buffer_recycler::recycler_instance{}; */ -/* std::mutex recycler::detail::buffer_recycler::mut{}; */ From 8c9dfe0c0dd65521c4ca05261235081e5a1a35c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 13:20:30 -0500 Subject: [PATCH 14/46] Make stream manager src file irrelevant --- CMakeLists.txt | 14 ++++++++------ include/stream_manager.hpp | 22 ++++------------------ src/stream_manager_definitions.cpp | 10 ---------- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1eb2b4e..dc32ac10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,7 +88,7 @@ endif() ## Interface targets -add_library(buffer_manager SHARED src/buffer_manager_definitions.cpp) +add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX") endif() @@ -100,9 +100,13 @@ target_include_directories(buffer_manager INTERFACE $ ) -add_library(stream_manager SHARED src/stream_manager_definitions.cpp) -target_link_libraries(stream_manager - PRIVATE ) +add_library(stream_manager INTERFACE) +if (CPPUDDLE_WITH_HPX) + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX") +endif() +if (CPPUDDLE_WITH_COUNTERS) + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") +endif() target_include_directories(stream_manager INTERFACE $ $ @@ -110,10 +114,8 @@ $ # install libs with the defitions: install(TARGETS buffer_manager EXPORT CPPuddle - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib ) install(TARGETS stream_manager EXPORT CPPuddle - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib ) # install all headers install( diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 0984678d..d3bda59d 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -212,49 +212,35 @@ class stream_pool { public: template static void init(size_t number_of_streams, Ts &&... 
executor_args) { - std::lock_guard guard(mut); - if (!access_instance) { - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - access_instance.reset(new stream_pool()); - } - assert(access_instance); stream_pool_implementation::init( number_of_streams, std::forward(executor_args)...); } template static void cleanup() { - assert(access_instance); // should already be initialized stream_pool_implementation::cleanup(); } template static std::tuple get_interface() { - assert(access_instance); // should already be initialized return stream_pool_implementation::get_interface(); } template static void release_interface(size_t index) noexcept { - assert(access_instance); // should already be initialized stream_pool_implementation::release_interface(index); } template static bool interface_available(size_t load_limit) noexcept { - assert(access_instance); // should already be initialized return stream_pool_implementation::interface_available( load_limit); } template static size_t get_current_load() noexcept { - assert(access_instance); // should already be initialized return stream_pool_implementation::get_current_load(); } template static size_t get_next_device_id() noexcept { - assert(access_instance); // should already be initialized return stream_pool_implementation::get_next_device_id(); } private: - static std::unique_ptr access_instance; - static std::mutex mut; stream_pool() = default; private: @@ -314,7 +300,7 @@ class stream_pool { } private: - static std::unique_ptr pool_instance; + inline static std::unique_ptr pool_instance{}; stream_pool_implementation() = default; inline static std::mutex pool_mut{}; @@ -341,9 +327,9 @@ class stream_pool { stream_pool &operator=(stream_pool &&other) = delete; }; -template -std::unique_ptr> - stream_pool::stream_pool_implementation::pool_instance{}; +/* template */ +/* std::unique_ptr> */ +/* stream_pool::stream_pool_implementation::pool_instance{}; */ template class stream_interface { public: diff --git a/src/stream_manager_definitions.cpp b/src/stream_manager_definitions.cpp index 173190f1..e69de29b 100644 --- a/src/stream_manager_definitions.cpp +++ b/src/stream_manager_definitions.cpp @@ -1,10 +0,0 @@ -// Copyright (c) 2020-2021 Gregor Daiß -// -// Distributed under the Boost Software License, Version 1.0. 
(See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -#include "../include/stream_manager.hpp" - -// Instance defintions -std::unique_ptr stream_pool::access_instance{}; -std::mutex stream_pool::mut{}; From af03c65f29b595f579cfd6adaa7cf54129661db1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 13:20:47 -0500 Subject: [PATCH 15/46] Remove stream manager src file --- src/stream_manager_definitions.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/stream_manager_definitions.cpp diff --git a/src/stream_manager_definitions.cpp b/src/stream_manager_definitions.cpp deleted file mode 100644 index e69de29b..00000000 From 7d48263efde16564bac712ad844389413c06523c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 14:12:09 -0500 Subject: [PATCH 16/46] Add hpx_aware gpu allocators --- include/cuda_buffer_util.hpp | 11 +++++++++++ include/hip_buffer_util.hpp | 11 +++++++++++ include/sycl_buffer_util.hpp | 11 +++++++++++ tests/allocator_cuda_test.cu | 2 +- 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index d2d0f596..d94c0708 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -7,6 +7,9 @@ #define CUDA_BUFFER_UTIL_HPP #include "buffer_manager.hpp" +#ifdef CPPUDDLE_HAVE_HPX +#include "hpx_buffer_util.hpp" +#endif #include #include @@ -103,6 +106,14 @@ using recycle_allocator_cuda_host = template ::value, int> = 0> using recycle_allocator_cuda_device = detail::recycle_allocator>; +#ifdef CPPUDDLE_HAVE_HPX +template ::value, int> = 0> +using numa_aware_recycle_allocator_cuda_host = + detail::numa_aware_aggressive_recycle_allocator>; +template ::value, int> = 0> +using hpx_aware_recycle_allocator_cuda_device = + detail::numa_aware_recycle_allocator>; +#endif template ::value, int> = 0> struct cuda_device_buffer { diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 87d41e3b..1d1b3d61 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -7,6 +7,9 @@ #define HIP_BUFFER_UTIL_HPP #include "buffer_manager.hpp" +#ifdef CPPUDDLE_HAVE_HPX +#include "hpx_buffer_util.hpp" +#endif #include #include @@ -109,6 +112,14 @@ using recycle_allocator_hip_host = template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; +#ifdef CPPUDDLE_HAVE_HPX +template ::value, int> = 0> +using numa_aware_recycle_allocator_hip_host = + detail::numa_aware_aggressive_recycle_allocator>; +template ::value, int> = 0> +using hpx_aware_recycle_allocator_hip_device = + detail::numa_aware_recycle_allocator>; +#endif template ::value, int> = 0> struct hip_device_buffer { diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 6469aa4e..28014da5 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -7,6 +7,9 @@ #define SYCL_BUFFER_UTIL_HPP #include "buffer_manager.hpp" +#ifdef CPPUDDLE_HAVE_HPX +#include "hpx_buffer_util.hpp" +#endif #include #include @@ -76,6 +79,14 @@ using recycle_allocator_sycl_host = template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; +#ifdef CPPUDDLE_HAVE_HPX +template ::value, int> = 0> +using numa_aware_recycle_allocator_sycl_host = + detail::numa_aware_aggressive_recycle_allocator>; +template ::value, int> = 0> +using hpx_aware_recycle_allocator_sycl_device = + detail::numa_aware_recycle_allocator>; +#endif } // end 
namespace recycler #endif diff --git a/tests/allocator_cuda_test.cu b/tests/allocator_cuda_test.cu index 3d43c17c..5697e542 100644 --- a/tests/allocator_cuda_test.cu +++ b/tests/allocator_cuda_test.cu @@ -12,7 +12,7 @@ #include #include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +/* #include "../include/cuda_buffer_util.hpp" */ constexpr size_t N = 200000; From 4a47942837a7898c882d7a14d794253ec68f1794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 15:19:58 -0500 Subject: [PATCH 17/46] Allow using without counters --- include/buffer_manager.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 776829fd..f1cdf15f 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -298,11 +298,13 @@ class buffer_recycler { // Should be fine when throwing as there's no real point in recovering at that stage std::lock_guard guard(mut); +#ifdef CPPUDDLE_HAVE_COUNTERS if (number_allocation == 0 && number_recycling == 0 && number_bad_alloc == 0 && number_creation == 0 && unused_buffer_list.empty() && buffer_map.empty()) { return; } +#endif for (auto &buffer_tuple : unused_buffer_list) { Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { From 6819ebb6d3960a6b89665600db4606f218dd511a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 16:21:18 -0500 Subject: [PATCH 18/46] Add location_id to aggregation buffers --- include/aggregation_manager.hpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index cd0f3afb..3319e498 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -529,7 +529,7 @@ template class Aggregated_Executor { /// Data entry for a buffer allocation: void* pointer, size_t for /// buffer-size, atomic for the slice counter using buffer_entry_t = - std::tuple, bool>; + std::tuple, bool, const size_t>; /// Keeps track of the aggregated buffer allocations done in all the slices std::deque buffer_allocations; /// Map pointer to deque index for fast access in the deallocations @@ -552,15 +552,17 @@ template class Aggregated_Executor { if (buffer_counter <= slice_alloc_counter) { constexpr bool manage_content_lifetime = false; buffers_in_use = true; + // get prefered location: aka the current hpx threads location + const size_t location_id = hpx::get_worker_thread_num(); // Get shiny and new buffer that will be shared between all slices // Buffer might be recycled from previous allocations by the // buffer_recycler... 
T *aggregated_buffer = - recycler::detail::buffer_recycler::get(size, - manage_content_lifetime); + recycler::detail::buffer_recycler::get( + size, manage_content_lifetime, location_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), - size, 1, true); + size, 1, true, location_id); #ifndef NDEBUG // if previousely used the buffer should not be in usage anymore @@ -613,6 +615,7 @@ template class Aggregated_Executor { const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); + const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); assert(valid); T *buffer_pointer = static_cast(buffer_pointer_void); @@ -630,7 +633,7 @@ template class Aggregated_Executor { if (valid) { assert(buffers_in_use == true); recycler::detail::buffer_recycler::mark_unused( - buffer_pointer, buffer_size); + buffer_pointer, buffer_size, location_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused valid = false; @@ -752,9 +755,9 @@ template class Aggregated_Executor { std::lock_guard guard(buffer_mut); #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, - valid] = buffer_entry; + const auto &[buffer_pointer_any, buffer_size, + buffer_allocation_counter, valid, location_id] = + buffer_entry; assert(!valid); } #endif @@ -879,12 +882,11 @@ template class Aggregated_Executor { overall_launch_counter = 0; #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, - valid] = buffer_entry; + const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, + valid, location_id] = buffer_entry; assert(!valid); } -#endif +#endif buffer_allocations.clear(); buffer_allocations_map.clear(); buffer_counter = 0; From c8516c8df1e1478fa8b9bf15ebcf6becdfaae3d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 16:26:52 -0500 Subject: [PATCH 19/46] Cleanup tuples --- include/aggregation_manager.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 3319e498..7fd54b41 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -609,13 +609,8 @@ template class Aggregated_Executor { size_t slice_alloc_counter = buffer_allocations_map[p]; assert(slice_alloc_counter < buffer_allocations.size()); - /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = - buffer_allocations[slice_alloc_counter];*/ - auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); - const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); - auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); - auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); - const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); + auto &[buffer_pointer_void, buffer_size, buffer_allocation_counter, valid, location_id] = + buffer_allocations[slice_alloc_counter]; assert(valid); T *buffer_pointer = static_cast(buffer_pointer_void); From bcda3b31d8be22a833b0ad247e399e2dcafeb2c5 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: 
Fri, 2 Jun 2023 16:28:46 -0500 Subject: [PATCH 20/46] Revert "Cleanup tuples" This reverts commit c8516c8df1e1478fa8b9bf15ebcf6becdfaae3d0. --- include/aggregation_manager.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 7fd54b41..3319e498 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -609,8 +609,13 @@ template class Aggregated_Executor { size_t slice_alloc_counter = buffer_allocations_map[p]; assert(slice_alloc_counter < buffer_allocations.size()); - auto &[buffer_pointer_void, buffer_size, buffer_allocation_counter, valid, location_id] = - buffer_allocations[slice_alloc_counter]; + /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = + buffer_allocations[slice_alloc_counter];*/ + auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); + const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); + auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); + auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); + const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); assert(valid); T *buffer_pointer = static_cast(buffer_pointer_void); From 6e47c668a9b9112315c6975f6857c2ff5da74f8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 17:26:24 -0500 Subject: [PATCH 21/46] Add numa aware aligned allocator --- CMakeLists.txt | 10 ++++++++-- include/aligned_buffer_util.hpp | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc32ac10..4f573d98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,8 +132,14 @@ if (CPPUDDLE_WITH_TESTS) ${Boost_LIBRARIES} Boost::boost Boost::program_options buffer_manager) add_executable(allocator_aligned_test tests/allocator_aligned_test.cpp) - target_link_libraries(allocator_aligned_test - ${Boost_LIBRARIES} Boost::boost Boost::program_options buffer_manager) + if (CPPUDDLE_WITH_HPX) + target_link_libraries(allocator_aligned_test + ${Boost_LIBRARIES} HPX::hpx Boost::boost Boost::program_options buffer_manager) + else() + target_link_libraries(allocator_aligned_test + ${Boost_LIBRARIES} HPX::hpx Boost::boost Boost::program_options buffer_manager) + endif() + if (CPPUDDLE_WITH_HPX) diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 456420bc..8c83e3a9 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -8,6 +8,9 @@ #include "buffer_manager.hpp" #include +#ifdef CPPUDDLE_HAVE_HPX +#include "hpx_buffer_util.hpp" +#endif namespace recycler { template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< T, boost::alignment::aligned_allocator>; +#ifdef CPPUDDLE_HAVE_HPX +template ::value, int> = 0> +using numa_aware_recycle_aligned = detail::numa_aware_recycle_allocator< + T, boost::alignment::aligned_allocator>; +template ::value, int> = 0> +using numa_aware_aggressive_recycle_aligned = + detail::numa_aware_aggressive_recycle_allocator< + T, boost::alignment::aligned_allocator>; +#endif } // namespace recycler #endif From 2e4a78f96c5688e378e56503234dd6e88a6259d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 18:37:15 -0500 Subject: [PATCH 22/46] Fix aligned test --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 4f573d98..e0c53ac8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repe option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF) option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) -option(CPPUDDLE_WITH_HPX "Enable HPX examples" OFF) +option(CPPUDDLE_WITH_HPX "Enable HPX integration and examples" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) @@ -137,7 +137,7 @@ if (CPPUDDLE_WITH_TESTS) ${Boost_LIBRARIES} HPX::hpx Boost::boost Boost::program_options buffer_manager) else() target_link_libraries(allocator_aligned_test - ${Boost_LIBRARIES} HPX::hpx Boost::boost Boost::program_options buffer_manager) + ${Boost_LIBRARIES} Boost::boost Boost::program_options buffer_manager) endif() From b38b24154de7ccf7edc75723a57dabb171a80eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 18:46:20 -0500 Subject: [PATCH 23/46] Disable aligned valgrind test and output test errors --- .github/workflows/cmake.yml | 2 +- CMakeLists.txt | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 5ff62786..a6fd9f4f 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -39,4 +39,4 @@ jobs: - name: Test working-directory: ${{github.workspace}}/build shell: bash - run: ctest + run: ctest --output-on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index e0c53ac8..02840fff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -341,14 +341,6 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(allocator_aligned_test.fixture_cleanup PROPERTIES FIXTURES_CLEANUP allocator_aligned_test_output ) - find_program(VALGRIND_COMMAND valgrind) - if (VALGRIND_COMMAND) - add_test(allocator_memcheck.valgrind - ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_aligned_test --arraysize 5000000 --passes 200) - set_tests_properties(allocator_memcheck.valgrind PROPERTIES - PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" - ) - endif() if (CPPUDDLE_WITH_HPX) # Concurrency tests From ea63c12611ad4afc164a3e1a9543d1acf4fb29d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 19:36:25 -0500 Subject: [PATCH 24/46] Fix double free issue --- include/buffer_manager.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index f1cdf15f..39ab1deb 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -88,10 +88,7 @@ class buffer_recycler { } public: - /* ~buffer_recycler() = default; // public destructor for unique_ptr instance */ - ~buffer_recycler() { - clean_all(); - } + ~buffer_recycler() = default; // public destructor for unique_ptr instance // Subclasses private: From 0545b443ddc1485a03b9a41b0f176e780e3afae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 2 Jun 2023 23:35:15 -0500 Subject: [PATCH 25/46] Protect callback lists --- include/buffer_manager.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 39ab1deb..56d00053 
100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -22,6 +22,7 @@ #endif + namespace recycler { constexpr size_t number_instances = 128; namespace detail { @@ -45,6 +46,7 @@ class buffer_recycler { } /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { + std::lock_guard guard(instance().callback_protection_mut); for (const auto &clean_function : instance().total_cleanup_callbacks) { clean_function(); @@ -52,6 +54,7 @@ class buffer_recycler { } /// Deallocated all currently unused buffer static void clean_unused_buffers() { + std::lock_guard guard(instance().callback_protection_mut); for (const auto &clean_function : instance().partial_cleanup_callbacks) { clean_function(); @@ -75,15 +78,17 @@ class buffer_recycler { /// default, private constructor - not automatically constructed due to the /// deleted constructors buffer_recycler() = default; + + std::mutex callback_protection_mut; /// Add a callback function that gets executed upon cleanup and destruction static void add_total_cleanup_callback(const std::function &func) { - /* std::lock_guard guard(instance().mut); */ + std::lock_guard guard(instance().callback_protection_mut); instance().total_cleanup_callbacks.push_back(func); } /// Add a callback function that gets executed upon partial (unused memory) /// cleanup static void add_partial_cleanup_callback(const std::function &func) { - /* std::lock_guard guard(instance().mut); */ + std::lock_guard guard(instance().callback_protection_mut); instance().partial_cleanup_callbacks.push_back(func); } From 96b5d16cc1da80bc8bcada73db9ec6761fc1449c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 3 Jun 2023 23:57:57 -0500 Subject: [PATCH 26/46] Allow for more mutex choices --- include/buffer_manager.hpp | 49 ++++++++++++++++++++++++-------- tests/allocator_aligned_test.cpp | 21 +++++++++++++- tests/allocator_cuda_test.cu | 3 +- tests/allocator_hpx_test.cpp | 12 -------- tests/allocator_kokkos_test.cpp | 21 +++++++++++++- tests/allocator_test.cpp | 18 ++++++++++++ 6 files changed, 97 insertions(+), 27 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 56d00053..89eea77f 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -17,16 +17,39 @@ #include #include +#ifdef CPPUDDLE_HAVE_HPX +// For builds with The HPX mutex +#include +#endif + #ifdef CPPUDDLE_HAVE_COUNTERS #include #endif - +// TODO Switch mutex_t globally? +// -- Problem: Mutex only works when HPX is initialized (what about non HPX +// builds?) +// -- What about the times when HPX is not initialized anymore (locks in +// destructor + static) +// TODO add mutex type to template parameter list? +// -- What about adding stuff to other parts of the code? mutex for callbacks? +// +// Decision: +// Switch globally +// Otherwise the template mutex parameter leaks into the interface of the +// allocators which complicates both the code and the usage. It also generates +// corner cases of intermixing mutexes... 
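// A condensed sketch of the globally-switched mutex pattern this decision
// settles on, drawn from the surrounding hunks (template arguments spelled
// out here; hpx::lcos::local::mutex is what this patch picks for HPX builds,
// and a later patch in this series switches it to hpx::spinlock):
//
//   #ifdef CPPUDDLE_HAVE_HPX
//   using mutex_t = hpx::lcos::local::mutex;  // HPX-aware mutex for HPX builds
//   #else
//   using mutex_t = std::mutex;               // plain std::mutex otherwise
//   #endif
//
//   static void clean_all() {
//     std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
//     for (const auto &clean_function : instance().total_cleanup_callbacks)
//       clean_function();
//   }
//
// The alias keeps the mutex choice out of the allocator interfaces: every
// internal locking site uses mutex_t, so no template parameter has to be
// threaded through the public allocator types.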
namespace recycler { constexpr size_t number_instances = 128; namespace detail { +#ifdef CPPUDDLE_HAVE_HPX +using mutex_t = hpx::lcos::local::mutex; +#else +using mutex_t = std::mutex; +#endif + class buffer_recycler { // Public interface public: @@ -46,7 +69,7 @@ class buffer_recycler { } /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { - std::lock_guard guard(instance().callback_protection_mut); + std::lock_guard guard(instance().callback_protection_mut); for (const auto &clean_function : instance().total_cleanup_callbacks) { clean_function(); @@ -54,7 +77,7 @@ class buffer_recycler { } /// Deallocated all currently unused buffer static void clean_unused_buffers() { - std::lock_guard guard(instance().callback_protection_mut); + std::lock_guard guard(instance().callback_protection_mut); for (const auto &clean_function : instance().partial_cleanup_callbacks) { clean_function(); @@ -79,16 +102,16 @@ class buffer_recycler { /// deleted constructors buffer_recycler() = default; - std::mutex callback_protection_mut; + mutex_t callback_protection_mut; /// Add a callback function that gets executed upon cleanup and destruction static void add_total_cleanup_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); + std::lock_guard guard(instance().callback_protection_mut); instance().total_cleanup_callbacks.push_back(func); } /// Add a callback function that gets executed upon partial (unused memory) /// cleanup static void add_partial_cleanup_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); + std::lock_guard guard(instance().callback_protection_mut); instance().partial_cleanup_callbacks.push_back(func); } @@ -113,7 +136,7 @@ class buffer_recycler { /// Cleanup all buffers not currently in use static void clean_unused_buffers_only() { for (auto i = 0; i < number_instances; i++) { - std::lock_guard guard(instance()[i].mut); + std::lock_guard guard(instance()[i].mut); for (auto &buffer_tuple : instance()[i].unused_buffer_list) { Host_Allocator alloc; if (std::get<3>(buffer_tuple)) { @@ -134,7 +157,7 @@ class buffer_recycler { if (location_hint) { location_id = location_hint.value(); } - std::lock_guard guard(instance()[location_id].mut); + std::lock_guard guard(instance()[location_id].mut); #ifdef CPPUDDLE_HAVE_COUNTERS @@ -181,6 +204,7 @@ class buffer_recycler { return buffer; } catch (std::bad_alloc &e) { // not enough memory left! Cleanup and attempt again: + std::cerr << "Not enough memory left. Cleaning up unused buffers now..." 
<< std::endl; buffer_recycler::clean_unused_buffers(); // If there still isn't enough memory left, the caller has to handle it @@ -206,7 +230,7 @@ class buffer_recycler { if (location_hint) { size_t location_id = location_hint.value(); - std::lock_guard guard(instance()[location_id].mut); + std::lock_guard guard(instance()[location_id].mut); if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { #ifdef CPPUDDLE_HAVE_COUNTERS @@ -235,7 +259,7 @@ class buffer_recycler { continue; // already tried this -> skip } } - std::lock_guard guard(instance()[location_id].mut); + std::lock_guard guard(instance()[location_id].mut); if (instance()[location_id].buffer_map.find(memory_location) != instance()[location_id].buffer_map.end()) { #ifdef CPPUDDLE_HAVE_COUNTERS @@ -263,7 +287,7 @@ class buffer_recycler { /// List with all buffers currently not used std::list unused_buffer_list{}; /// Access control - std::mutex mut; + mutex_t mut; #ifdef CPPUDDLE_HAVE_COUNTERS /// Performance counters size_t number_allocation{0}, number_dealloacation{0}, number_wrong_hints{0}; @@ -298,7 +322,8 @@ class buffer_recycler { ~buffer_manager() { // All operations should have finished before this is happening // Should be fine when throwing as there's no real point in recovering at that stage - std::lock_guard guard(mut); + // TODO mutex here is a bad idea as the HPX runtime is already shut down + /* std::lock_guard guard(mut); */ #ifdef CPPUDDLE_HAVE_COUNTERS if (number_allocation == 0 && number_recycling == 0 && diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 552774eb..882a2c0c 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,8 +3,11 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/aligned_buffer_util.hpp" #include "../include/buffer_manager.hpp" +#include "../include/aligned_buffer_util.hpp" +#ifdef CPPUDDLE_HAVE_HPX +#include +#endif #include #include @@ -15,7 +18,11 @@ #include #include +#ifdef CPPUDDLE_HAVE_HPX +int hpx_main(int argc, char *argv[]) { +#else int main(int argc, char *argv[]) { +#endif size_t array_size = 500000; size_t passes = 10000; @@ -137,5 +144,17 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Recycler was faster than default allocator!" << std::endl; } +#ifdef CPPUDDLE_HAVE_HPX + return hpx::finalize(); +#else return EXIT_SUCCESS; +#endif +} + +#ifdef CPPUDDLE_HAVE_HPX +int main(int argc, char *argv[]) { + hpx::init_params p; + p.cfg = {"hpx.commandline.allow_unknown=1"}; + return hpx::init(argc, argv, p); } +#endif diff --git a/tests/allocator_cuda_test.cu b/tests/allocator_cuda_test.cu index 5697e542..f646914d 100644 --- a/tests/allocator_cuda_test.cu +++ b/tests/allocator_cuda_test.cu @@ -3,6 +3,7 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +//TODO delete obsolete test or fix/modernize? 
#include #include #include @@ -11,7 +12,7 @@ #include -#include "../include/buffer_manager.hpp" +/* #include "../include/buffer_manager.hpp" */ /* #include "../include/cuda_buffer_util.hpp" */ diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 8dae8d7b..2e48f925 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -131,18 +131,6 @@ int hpx_main(int argc, char *argv[]) { recycler::force_cleanup(); // Cleanup all buffers and the managers for better // comparison - // ensure that at least 4 buffers have to created for unit testing - { - std::vector> buffer1( - array_size, double{}); - std::vector> buffer2( - array_size, double{}); - std::vector> buffer3( - array_size, double{}); - std::vector> buffer4( - array_size, double{}); - } - // Aggressive recycle Test: { auto begin = std::chrono::high_resolution_clock::now(); diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index af813457..de808859 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -16,6 +16,9 @@ #include "../include/buffer_manager.hpp" #include "../include/cuda_buffer_util.hpp" #include "../include/kokkos_buffer_util.hpp" +#ifdef CPPUDDLE_HAVE_HPX +#include +#endif #include #include #include @@ -32,8 +35,11 @@ template using recycled_host_view = recycler::recycled_view, recycler::recycle_std, T>; -// #pragma nv_exec_check_disable +#ifdef CPPUDDLE_HAVE_HPX +int hpx_main(int argc, char *argv[]) { +#else int main(int argc, char *argv[]) { +#endif std::string filename{}; try { @@ -84,4 +90,17 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } +#ifdef CPPUDDLE_HAVE_HPX + return hpx::finalize(); +#else + return EXIT_SUCCESS; +#endif } + +#ifdef CPPUDDLE_HAVE_HPX +int main(int argc, char *argv[]) { + hpx::init_params p; + p.cfg = {"hpx.commandline.allow_unknown=1"}; + return hpx::init(argc, argv, p); +} +#endif diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 12ad1eb4..e86f00ce 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -4,6 +4,9 @@ // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #include "../include/buffer_manager.hpp" +#ifdef CPPUDDLE_HAVE_HPX +#include +#endif #include #include @@ -14,7 +17,11 @@ #include #include +#ifdef CPPUDDLE_HAVE_HPX +int hpx_main(int argc, char *argv[]) { +#else int main(int argc, char *argv[]) { +#endif size_t array_size = 500000; size_t passes = 10000; @@ -129,5 +136,16 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Recycler was faster than default allocator!" 
<< std::endl; } +#ifdef CPPUDDLE_HAVE_HPX + return hpx::finalize(); +#else return EXIT_SUCCESS; +#endif } +#ifdef CPPUDDLE_HAVE_HPX +int main(int argc, char *argv[]) { + hpx::init_params p; + p.cfg = {"hpx.commandline.allow_unknown=1"}; + return hpx::init(argc, argv, p); +} +#endif From 11e31dc49553a27ebfd63df7532394b2e9565eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sun, 4 Jun 2023 09:42:44 -0500 Subject: [PATCH 27/46] Fix deprecation warnings --- include/aggregation_manager.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 3319e498..4441c235 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -42,7 +42,7 @@ #include "../include/buffer_manager.hpp" #include "../include/stream_manager.hpp" -using aggregation_mutex_t = hpx::lcos::local::mutex; +using aggregation_mutex_t = hpx::mutex; //=============================================================================== //=============================================================================== @@ -902,8 +902,8 @@ template class Aggregated_Executor { executor_tuple( stream_pool::get_interface>()), executor(std::get<0>(executor_tuple)), - current_continuation(hpx::lcos::make_ready_future()), - last_stream_launch_done(hpx::lcos::make_ready_future()) {} + current_continuation(hpx::make_ready_future()), + last_stream_launch_done(hpx::make_ready_future()) {} // Not meant to be copied or moved Aggregated_Executor(const Aggregated_Executor &other) = delete; Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; From 2b4ce38c6e0fa0bbf547cf4c0d17c59578f65110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sun, 4 Jun 2023 09:42:58 -0500 Subject: [PATCH 28/46] Add finalize method --- include/buffer_manager.hpp | 83 ++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 89eea77f..6ca57f1c 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -26,26 +26,12 @@ #include #endif -// TODO Switch mutex_t globally? -// -- Problem: Mutex only works when HPX is initialized (what about non HPX -// builds?) -// -- What about the times when HPX is not initialized anymore (locks in -// destructor + static) -// TODO add mutex type to template parameter list? -// -- What about adding stuff to other parts of the code? mutex for callbacks? -// -// Decision: -// Switch globally -// Otherwise the template mutex parameter leaks into the interface of the -// allocators which complicates both the code and the usage. It also generates -// corner cases of intermixing mutexes... 
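// Usage sketch for the finalize entry point this patch adds below (the
// hpx_main wrapper and the recycle_std buffer are illustrative assumptions;
// the patch itself only provides recycler::finalize() plus an is_finalized
// guard that makes later allocations throw):
//
//   int hpx_main(int argc, char *argv[]) {
//     {
//       // recycled buffers are used only while the runtime is up
//       std::vector<double, recycler::recycle_std<double>> buf(1000, 0.0);
//     }
//     // Explicitly tear down all buffer managers while HPX is still running,
//     // rather than relying on static destructors that may fire after the
//     // runtime has shut down (the double-free / late-locking issue touched
//     // on in the previous patches).
//     recycler::finalize();
//     return hpx::finalize();
//   }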
- namespace recycler { constexpr size_t number_instances = 128; namespace detail { #ifdef CPPUDDLE_HAVE_HPX -using mutex_t = hpx::lcos::local::mutex; +using mutex_t = hpx::mutex; #else using mutex_t = std::mutex; #endif @@ -83,6 +69,14 @@ class buffer_recycler { clean_function(); } } + /// Deallocate all buffers, no matter whether they are marked as used or not + static void finalize() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &finalize_function : + instance().finalize_callbacks) { + finalize_function(); + } + } // Member variables and methods private: @@ -92,8 +86,11 @@ class buffer_recycler { static buffer_recycler singleton{}; return singleton; } - /// Callbacks for buffer_manager cleanups - each callback completely destroys + /// Callbacks for buffer_manager finalize - each callback completely destroys /// one buffer_manager + std::list> finalize_callbacks; + /// Callbacks for buffer_manager cleanups - each callback destroys all buffers within + /// one buffer_manager, both used and unsued std::list> total_cleanup_callbacks; /// Callbacks for partial buffer_manager cleanups - each callback deallocates /// all unused buffers of a manager @@ -114,6 +111,12 @@ class buffer_recycler { std::lock_guard guard(instance().callback_protection_mut); instance().partial_cleanup_callbacks.push_back(func); } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_finalize_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().finalize_callbacks.push_back(func); + } public: ~buffer_recycler() = default; // public destructor for unique_ptr instance @@ -131,10 +134,24 @@ class buffer_recycler { public: /// Cleanup and delete this singleton static void clean() { - instance().reset(new buffer_manager[number_instances]); + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].clean_all_buffers(); + } + } + static void finalize() { + assert(instance() && !is_finalized); + is_finalized = true; + for (auto i = 0; i < number_instances; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].clean_all_buffers(); + } + instance().reset(); } /// Cleanup all buffers not currently in use static void clean_unused_buffers_only() { + assert(instance() && !is_finalized); for (auto i = 0; i < number_instances; i++) { std::lock_guard guard(instance()[i].mut); for (auto &buffer_tuple : instance()[i].unused_buffer_list) { @@ -152,6 +169,10 @@ class buffer_recycler { static T *get(size_t number_of_elements, bool manage_content_lifetime, std::optional location_hint = std::nullopt) { init_callbacks_once(); + if (is_finalized) { + throw std::runtime_error("Tried allocation after finalization"); + } + assert(instance() && !is_finalized); size_t location_id = 0; if (location_hint) { @@ -227,6 +248,9 @@ class buffer_recycler { static void mark_unused(T *memory_location, size_t number_of_elements, std::optional location_hint = std::nullopt) { + if (is_finalized) + return; + assert(instance() && !is_finalized); if (location_hint) { size_t location_id = location_hint.value(); @@ -309,22 +333,26 @@ class buffer_recycler { return instances; } static void init_callbacks_once(void) { - static std::once_flag flag; + assert(instance()); +#ifdef CPPUDDLE_HAVE_HPX + static hpx::once_flag flag; + hpx::call_once(flag, []() { +#else + static std::once_flag flag; std::call_once(flag, []() { 
+#endif + is_finalized = false; buffer_recycler::add_total_cleanup_callback(clean); buffer_recycler::add_partial_cleanup_callback( clean_unused_buffers_only); + buffer_recycler::add_finalize_callback( + finalize); }); } + static inline std::atomic is_finalized; - public: - ~buffer_manager() { - // All operations should have finished before this is happening - // Should be fine when throwing as there's no real point in recovering at that stage - // TODO mutex here is a bad idea as the HPX runtime is already shut down - /* std::lock_guard guard(mut); */ - + void clean_all_buffers(void) { #ifdef CPPUDDLE_HAVE_COUNTERS if (number_allocation == 0 && number_recycling == 0 && number_bad_alloc == 0 && number_creation == 0 && @@ -386,6 +414,10 @@ class buffer_recycler { unused_buffer_list.clear(); buffer_map.clear(); } + public: + ~buffer_manager() { + clean_all_buffers(); + } public: // Putting deleted constructors in public gives more useful error // messages @@ -500,6 +532,7 @@ using aggressive_recycle_std = inline void force_cleanup() { detail::buffer_recycler::clean_all(); } /// Deletes all buffers currently marked as unused inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } +inline void finalize() { detail::buffer_recycler::finalize(); } } // end namespace recycler From 6e586b03a9029b681d77effcb511888326e45532 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Sun, 4 Jun 2023 09:54:53 -0500 Subject: [PATCH 29/46] Add buran machine to config --- scripts/build_dependencies.sh | 3 +++ scripts/configure_build_directory.sh | 3 +++ scripts/machine_configs.sh | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/scripts/build_dependencies.sh b/scripts/build_dependencies.sh index edfd2c38..0c6ced74 100755 --- a/scripts/build_dependencies.sh +++ b/scripts/build_dependencies.sh @@ -44,6 +44,9 @@ case $(hostname) in toranj*) source_config_toranj ${SCRIPTS_DIR} "$1" "$2" ;; + buran*) + source_config_buran ${SCRIPTS_DIR} "$1" "$2" + ;; *) source_config_default ${SCRIPTS_DIR} "$1" "$2" ;; diff --git a/scripts/configure_build_directory.sh b/scripts/configure_build_directory.sh index 177b8b81..1b2f6fd8 100755 --- a/scripts/configure_build_directory.sh +++ b/scripts/configure_build_directory.sh @@ -39,6 +39,9 @@ case $(hostname) in toranj*) source_config_toranj ${SCRIPTS_DIR} "$1" "$2" ;; + buran*) + source_config_buran ${SCRIPTS_DIR} "$1" "$2" + ;; *) source_config_default ${SCRIPTS_DIR} "$1" "$2" ;; diff --git a/scripts/machine_configs.sh b/scripts/machine_configs.sh index 4637b09c..84ad286d 100644 --- a/scripts/machine_configs.sh +++ b/scripts/machine_configs.sh @@ -119,6 +119,33 @@ function source_config_toranj() { export CURRENT_CUDA_ARCH_FLAG="-DKokkos_ARCH_SKX=ON -DKokkos_ARCH_AMPERE80=ON" } +function source_config_buran() { + SCRIPTS_DIR="$1" + export CMAKE_BUILD_TYPE="$2" + if [[ "${3}" == "gcc" ]]; then + export CXX=${SCRIPTS_DIR}/../external_dependencies/kokkos/bin/nvcc_wrapper + export NVCC_WRAPPER_DEFAULT_COMPILER="g++" + export HPX_COMPILER="g++" + elif [[ "${3}" == "clang" ]]; then + export CXX=clang++ + export HPX_COMPILER=clang++ + else + echo "Invalid compiler!" 
+ exit 1 + fi + export APPEND_DIRNAME="$3-$2" + + + #export CXX=${SCRIPTS_DIR}/../external_dependencies/kokkos/bin/nvcc_wrapper + export CXXFLAGS="-Wno-cpp" # Silence deprecated header warnings in HPX + #export CMAKE_BUILD_TYPE=Release + export HPX_ROOT=${SCRIPTS_DIR}/../external_dependencies/install/hpx-${APPEND_DIRNAME}/lib64/cmake/HPX + export Kokkos_ROOT=${SCRIPTS_DIR}/../external_dependencies/install/kokkos-${APPEND_DIRNAME}/lib64/cmake/Kokkos + export HPXKokkos_ROOT=${SCRIPTS_DIR}/../external_dependencies/install/hpx-kokkos-${APPEND_DIRNAME}/lib64/cmake/HPXKokkos + + export CURRENT_CUDA_ARCH_FLAG="-DKokkos_ARCH_ZEN2=ON " +} + function source_config_default() { echo -e "\033[33mWARNING: Default configuration... You likely need to modify this\033[0m" sleep 8 From 05188818f2961e6ad90eafb5cdfa97c513ff4fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sun, 4 Jun 2023 09:57:31 -0500 Subject: [PATCH 30/46] Add missing cmake changes --- CMakeLists.txt | 57 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02840fff..83e142fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,11 +3,16 @@ # Distributed under the Boost Software License, Version 1.0. (See accompanying # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -cmake_minimum_required(VERSION 3.11) +cmake_minimum_required(VERSION 3.16) project(CPPuddle CXX C) # Some random project name set(CMAKE_CXX_STANDARD 17) +set(CPPUDDLE_VERSION_MAJOR 0) +set(CPPUDDLE_VERSION_MINOR 1) +set(CPPUDDLE_VERSION_PATCH 99) +set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}.${CPPUDDLE_VERSION_PATCH}.") + option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF) @@ -128,8 +133,13 @@ install(EXPORT CPPuddle NAMESPACE CPPuddle:: DESTINATION ${CMAKE_INSTALL_PREFIX} ## Add target for tests and tests definitions if (CPPUDDLE_WITH_TESTS) add_executable(allocator_test tests/allocator_test.cpp) + if (CPPUDDLE_WITH_HPX) + target_link_libraries(allocator_test + ${Boost_LIBRARIES} HPX::hpx Boost::boost Boost::program_options buffer_manager) + else() target_link_libraries(allocator_test ${Boost_LIBRARIES} Boost::boost Boost::program_options buffer_manager) + endif() add_executable(allocator_aligned_test tests/allocator_aligned_test.cpp) if (CPPUDDLE_WITH_HPX) @@ -284,13 +294,23 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(allocator_test.fixture_cleanup PROPERTIES FIXTURES_CLEANUP allocator_test_output ) - find_program(VALGRIND_COMMAND valgrind) - if (VALGRIND_COMMAND) - add_test(allocator_memcheck.valgrind - ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_test --arraysize 5000000 --passes 200) - set_tests_properties(allocator_memcheck.valgrind PROPERTIES - PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" - ) + # Valgrind test only works properly in non-HPX builds + # With HPX, we get errors straight from the HPX runtime. 
+ # Non-HPX build is part of the github actions CI, the valgrind tests thus get done there + if (NOT (CPPUDDLE_WITH_HPX)) + find_program(VALGRIND_COMMAND valgrind) + if (VALGRIND_COMMAND) + add_test(allocator_memcheck.valgrind + ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_test --arraysize 5000000 --passes 200) + set_tests_properties(allocator_memcheck.valgrind PROPERTIES + PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" + ) + add_test(allocator_aligned_memcheck.valgrind + ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_aligned_test --arraysize 5000000 --passes 200) + set_tests_properties(allocator_aligned_memcheck.valgrind PROPERTIES + PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" + ) + endif() endif() # Aligned alloc tests @@ -344,7 +364,7 @@ if (CPPUDDLE_WITH_TESTS) if (CPPUDDLE_WITH_HPX) # Concurrency tests - add_test(allocator_concurrency_test.run allocator_hpx_test -t4 --passes 20 --outputfile allocator_concurrency_test.out) + add_test(allocator_concurrency_test.run allocator_hpx_test --hpx:threads=4 --passes 200 --futures=4 --outputfile allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.run PROPERTIES FIXTURES_SETUP allocator_concurrency_output PROCESSORS 4 @@ -353,7 +373,7 @@ if (CPPUDDLE_WITH_TESTS) add_test(allocator_concurrency_test.analyse_recycle_rate cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.analyse_recycle_rate PROPERTIES FIXTURES_REQUIRED allocator_concurrency_output - PASS_REGULAR_EXPRESSION "==> Recycle rate: [ ]* 99.6885%" + PASS_REGULAR_EXPRESSION "==> Recycle rate: [ ]* 99.5%" ) add_test(allocator_concurrency_test.analyse_marked_buffers_cleanup cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.analyse_marked_buffers_cleanup PROPERTIES @@ -375,6 +395,11 @@ if (CPPUDDLE_WITH_TESTS) FIXTURES_REQUIRED allocator_concurrency_output PASS_REGULAR_EXPRESSION "--> Number of bad_allocs that triggered garbage collection: [ ]* 0" ) + add_test(allocator_concurrency_test.analyse_bad_hints cat allocator_concurrency_test.out) + set_tests_properties(allocator_concurrency_test.analyse_bad_allocs PROPERTIES + FIXTURES_REQUIRED allocator_concurrency_output + PASS_REGULAR_EXPRESSION "--> Number wrong deallocation hints: [ ]* 0" + ) endif() if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on add_test(allocator_concurrency_test.performance.analyse_recycle_performance cat allocator_concurrency_test.out) @@ -464,21 +489,27 @@ if (CPPUDDLE_WITH_TESTS) FIXTURES_SETUP aggregation_basic_parallel_test_output PROCESSORS 4 ) + # new concurrent buffer managers change the game here.. 
as two aggregated runs are in parallel: 3 1 add_test(aggregation_basic_parallel_test.analyse_int_buffers cat aggregation_basic_parallel_test.out) set_tests_properties(aggregation_basic_parallel_test.analyse_int_buffers PROPERTIES FIXTURES_REQUIRED aggregation_basic_parallel_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 2" + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 1" ) add_test(aggregation_basic_parallel_test.analyse_float_buffers cat aggregation_basic_parallel_test.out) set_tests_properties(aggregation_basic_parallel_test.analyse_float_buffers PROPERTIES FIXTURES_REQUIRED aggregation_basic_parallel_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 6" + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" ) add_test(aggregation_basic_parallel_test.analyse_cleanup cat aggregation_basic_parallel_test.out) set_tests_properties(aggregation_basic_parallel_test.analyse_cleanup PROPERTIES FIXTURES_REQUIRED aggregation_basic_parallel_test_output PASS_REGULAR_EXPRESSION "--> Number of buffers that were marked as used upon cleanup: [ ]* 0" ) + add_test(aggregation_basic_parallel_test.analyse_cleanup cat aggregation_basic_parallel_test.out) + set_tests_properties(aggregation_basic_parallel_test.analyse_cleanup PROPERTIES + FIXTURES_REQUIRED aggregation_basic_parallel_test_output + PASS_REGULAR_EXPRESSION "--> Number wrong deallocation hints: [ ]* 0" + ) @@ -528,7 +559,7 @@ if (CPPUDDLE_WITH_TESTS) add_test(aggregation_add_pointer_test.analyse_number_buffers cat aggregation_add_pointer_test.out) set_tests_properties(aggregation_add_pointer_test.analyse_number_buffers PROPERTIES FIXTURES_REQUIRED aggregation_add_pointer_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 6" + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" ) add_test(aggregation_add_references_test.run work_aggregation_test -t4 --outputfile=aggregation_add_references_test.out --scenario=references_add_test) From 50d9625a55fdb222d82b5e315b8eefdbbd0bd6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sun, 4 Jun 2023 10:17:23 -0500 Subject: [PATCH 31/46] Allow mutex choice --- CMakeLists.txt | 19 +++++++++++++++++++ include/buffer_manager.hpp | 7 ++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 83e142fd..713f2ec1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,15 @@ option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) +set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL + "Use HPX spinlock mutex instead of std::mutex") + +if(CPPUDDLE_WITH_HPX_MUTEX) + if(NOT CPPUDDLE_WITH_HPX) + message(FATAL_ERROR "CPPUDDLE_WITH_HPX_MUTEX requires a build with HPX (CPPUDDLE_WITH_HPX=ON") + endif() +endif() + if (CPPUDDLE_WITH_CUDA) enable_language(CUDA) endif () @@ -117,6 +126,16 @@ $ $ ) +if(CPPUDDLE_WITH_HPX_MUTEX) + target_compile_definitions(buffer_manager PUBLIC "CPPUDDLE_HAVE_HPX_MUTEX") + target_compile_definitions(stream_manager PUBLIC "CPPUDDLE_HAVE_HPX_MUTEX") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_MUTEX") + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_MUTEX") + 
message(INFO "Compiling with HPX spinlock") +else() + message(INFO "Compiling with std::mutex!") +endif() + # install libs with the defitions: install(TARGETS buffer_manager EXPORT CPPuddle ) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 6ca57f1c..ee91328a 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -6,6 +6,7 @@ #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP +#include #include #include #include @@ -17,7 +18,7 @@ #include #include -#ifdef CPPUDDLE_HAVE_HPX +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) // For builds with The HPX mutex #include #endif @@ -30,7 +31,7 @@ namespace recycler { constexpr size_t number_instances = 128; namespace detail { -#ifdef CPPUDDLE_HAVE_HPX +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) using mutex_t = hpx::mutex; #else using mutex_t = std::mutex; @@ -334,7 +335,7 @@ class buffer_recycler { } static void init_callbacks_once(void) { assert(instance()); -#ifdef CPPUDDLE_HAVE_HPX +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) static hpx::once_flag flag; hpx::call_once(flag, []() { #else From 5bbf6d159b83cda9531cbe7216632f1a70f9eca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sun, 4 Jun 2023 11:48:24 -0500 Subject: [PATCH 32/46] Reset counters and disable one aggregation reference buffer test --- CMakeLists.txt | 70 +++++++++++++++++++++++--------------- include/buffer_manager.hpp | 7 ++++ 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 713f2ec1..fa34af9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -439,11 +439,11 @@ if (CPPUDDLE_WITH_TESTS) # GPU related tests if (CPPUDDLE_WITH_CUDA) - add_test(allocator_cuda_test.run allocator_cuda_test -t 4) + add_test(allocator_cuda_test.run allocator_cuda_test --hpx:threads=4) set_tests_properties(allocator_cuda_test.run PROPERTIES PROCESSORS 4 ) - add_test(stream_test.run stream_test -t 4) + add_test(stream_test.run stream_test --hpx:threads=4) set_tests_properties(stream_test.run PROPERTIES PROCESSORS 4 ) @@ -478,12 +478,12 @@ if (CPPUDDLE_WITH_TESTS) FIXTURES_REQUIRED allocator_kokkos_output PASS_REGULAR_EXPRESSION "--> Number of bad_allocs that triggered garbage collection: [ ]* 0" ) - add_test(allocator_kokkos_executor_for_loop_test.run allocator_kokkos_executor_for_loop_test -t 4) + add_test(allocator_kokkos_executor_for_loop_test.run allocator_kokkos_executor_for_loop_test --hpx:threads=4) set_tests_properties(allocator_kokkos_executor_for_loop_test.run PROPERTIES PROCESSORS 4 ) - add_test(aggregation_basic_sequential_test.run work_aggregation_test -t1 --outputfile=aggregation_basic_sequential_test.out --scenario=sequential_test) + add_test(aggregation_basic_sequential_test.run work_aggregation_test --hpx:threads=1 --outputfile=aggregation_basic_sequential_test.out --scenario=sequential_test) set_tests_properties(aggregation_basic_sequential_test.run PROPERTIES FIXTURES_SETUP aggregation_basic_sequential_test_output ) @@ -503,7 +503,7 @@ if (CPPUDDLE_WITH_TESTS) PASS_REGULAR_EXPRESSION "--> Number of buffers that were marked as used upon cleanup: [ ]* 0" ) - add_test(aggregation_basic_parallel_test.run work_aggregation_test -t4 --outputfile=aggregation_basic_parallel_test.out --scenario=sequential_test) + add_test(aggregation_basic_parallel_test.run work_aggregation_test --hpx:threads=4 --outputfile=aggregation_basic_parallel_test.out --scenario=sequential_test) 
set_tests_properties(aggregation_basic_parallel_test.run PROPERTIES FIXTURES_SETUP aggregation_basic_parallel_test_output PROCESSORS 4 @@ -532,7 +532,7 @@ if (CPPUDDLE_WITH_TESTS) - add_test(aggregation_interruption_test.run work_aggregation_test -t1 --outputfile=aggregation_interruption_test.out --scenario=interruption_test) + add_test(aggregation_interruption_test.run work_aggregation_test --hpx:threads=1 --outputfile=aggregation_interruption_test.out --scenario=interruption_test) set_tests_properties(aggregation_interruption_test.run PROPERTIES FIXTURES_SETUP aggregation_interruption_test_output ) @@ -549,7 +549,7 @@ if (CPPUDDLE_WITH_TESTS) - add_test(aggregation_failure_test.run work_aggregation_test -t1 --outputfile=aggregation_failure_test.out --scenario=failure_test) + add_test(aggregation_failure_test.run work_aggregation_test --hpx:threads=1 --outputfile=aggregation_failure_test.out --scenario=failure_test) set_tests_properties(aggregation_failure_test.run PROPERTIES FIXTURES_SETUP aggregation_failure_test_output ) @@ -565,7 +565,7 @@ if (CPPUDDLE_WITH_TESTS) ) - add_test(aggregation_add_pointer_test.run work_aggregation_test -t4 --outputfile=aggregation_add_pointer_test.out --scenario=pointer_add_test) + add_test(aggregation_add_pointer_test.run work_aggregation_test --hpx:threads=4 --outputfile=aggregation_add_pointer_test.out --scenario=pointer_add_test) set_tests_properties(aggregation_add_pointer_test.run PROPERTIES FIXTURES_SETUP aggregation_add_pointer_test_output PROCESSORS 4 @@ -581,7 +581,22 @@ if (CPPUDDLE_WITH_TESTS) PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" ) - add_test(aggregation_add_references_test.run work_aggregation_test -t4 --outputfile=aggregation_add_references_test.out --scenario=references_add_test) + add_test(aggregation_add_references_test_sequential.run work_aggregation_test --hpx:threads=1 --outputfile=aggregation_add_references_test_sequential.out --scenario=references_add_test) + set_tests_properties(aggregation_add_references_test_sequential.run PROPERTIES + FIXTURES_SETUP aggregation_add_references_test_sequential_output + PROCESSORS 1 + ) + add_test(aggregation_add_references_test_sequential.analyse_number_launches cat aggregation_add_references_test_sequential.out) + set_tests_properties(aggregation_add_references_test_sequential.analyse_number_launches PROPERTIES + FIXTURES_REQUIRED aggregation_add_references_test_sequential_output + PASS_REGULAR_EXPRESSION "Number add_launches=1" + ) + add_test(aggregation_add_references_test_sequential.analyse_number_buffers cat aggregation_add_references_test_sequential.out) + set_tests_properties(aggregation_add_references_test_sequential.analyse_number_buffers PROPERTIES + FIXTURES_REQUIRED aggregation_add_references_test_sequential_output + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" + ) + add_test(aggregation_add_references_test.run work_aggregation_test --hpx:threads=4 --outputfile=aggregation_add_references_test.out --scenario=references_add_test) set_tests_properties(aggregation_add_references_test.run PROPERTIES FIXTURES_SETUP aggregation_add_references_test_output PROCESSORS 4 @@ -591,11 +606,12 @@ if (CPPUDDLE_WITH_TESTS) FIXTURES_REQUIRED aggregation_add_references_test_output PASS_REGULAR_EXPRESSION "Number add_launches=1" ) - add_test(aggregation_add_references_test.analyse_number_buffers cat aggregation_add_references_test.out) - 
set_tests_properties(aggregation_add_references_test.analyse_number_buffers PROPERTIES - FIXTURES_REQUIRED aggregation_add_references_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" - ) + # TODO Re-enable test as soon as we have aggregated counters... + # add_test(aggregation_add_references_test.analyse_number_buffers cat aggregation_add_references_test.out) + # set_tests_properties(aggregation_add_references_test.analyse_number_buffers PROPERTIES + # FIXTURES_REQUIRED aggregation_add_references_test_output + # PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" + # ) # STREAM TESTS CPU @@ -606,7 +622,7 @@ if (CPPUDDLE_WITH_TESTS) message(STATUS "Deadlock check repetitions set to ${deadlock_check_repetitions}") # Try with few slices -- good to detect deadlocking on errors with the continuations - add_test(aggregation_stream_triad_cpu_eager_test1.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=2 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cpu_eager_test1.out) + add_test(aggregation_stream_triad_cpu_eager_test1.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=2 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cpu_eager_test1.out) set_tests_properties(aggregation_stream_triad_cpu_eager_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_eager_test_output1 PROCESSORS 4 @@ -626,7 +642,7 @@ if (CPPUDDLE_WITH_TESTS) # Try with odd number of slices # This would deadlock given the STRICT executor, the EAGER one should have no problem - add_test(aggregation_stream_triad_cpu_eager_test2.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=17 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cpu_eager_test2.out) + add_test(aggregation_stream_triad_cpu_eager_test2.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=17 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cpu_eager_test2.out) set_tests_properties(aggregation_stream_triad_cpu_eager_test2.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_eager_test_output2 PROCESSORS 4 @@ -645,7 +661,7 @@ if (CPPUDDLE_WITH_TESTS) # Try with large number of slices -- this is basically what should be used in production, hence it should be tested - add_test(aggregation_stream_triad_cpu_eager_test3.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cpu_eager_test3.out) + add_test(aggregation_stream_triad_cpu_eager_test3.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER 
--outputfile=aggregation_stream_triad_cpu_eager_test3.out) set_tests_properties(aggregation_stream_triad_cpu_eager_test3.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_eager_test_output3 PROCESSORS 4 @@ -664,7 +680,7 @@ if (CPPUDDLE_WITH_TESTS) # Basic test for the ENDLESS executor -- number slices should not matter here, hence the large value for it - add_test(aggregation_stream_triad_cpu_endless_test1.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=99999999 --repetitions=${deadlock_check_repetitions} --executor_type=ENDLESS --outputfile=aggregation_stream_triad_cpu_endless_test1.out) + add_test(aggregation_stream_triad_cpu_endless_test1.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=99999999 --repetitions=${deadlock_check_repetitions} --executor_type=ENDLESS --outputfile=aggregation_stream_triad_cpu_endless_test1.out) set_tests_properties(aggregation_stream_triad_cpu_endless_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_endless_test_output1 PROCESSORS 4 @@ -683,7 +699,7 @@ if (CPPUDDLE_WITH_TESTS) # Basic test for the STRICT executor - add_test(aggregation_stream_triad_cpu_strict_test1.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_test1.out) + add_test(aggregation_stream_triad_cpu_strict_test1.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_test1.out) set_tests_properties(aggregation_stream_triad_cpu_strict_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_strict_test_output1 PROCESSORS 4 @@ -702,7 +718,7 @@ if (CPPUDDLE_WITH_TESTS) # STRICT number of kernel launches should always be same -- hence we can check the aggregation working correctly here -- here it should be exactly 200 (no aggregation happening) - add_test(aggregation_stream_triad_cpu_strict_aggregation_test1.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=1 --repetitions=2 --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_aggregation_test1.out) + add_test(aggregation_stream_triad_cpu_strict_aggregation_test1.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=1 --repetitions=2 --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_aggregation_test1.out) set_tests_properties(aggregation_stream_triad_cpu_strict_aggregation_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_strict_aggregation_test_output1 PROCESSORS 4 @@ -726,7 +742,7 @@ if (CPPUDDLE_WITH_TESTS) # STRICT number of kernel launches should always be same -- hence we can check the aggregation working correctly here -- here it should be exactly 30 - add_test(aggregation_stream_triad_cpu_strict_aggregation_test2.run work_aggregation_cpu_triad -t 4 
--number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=10 --repetitions=3 --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_aggregation_test2.out) + add_test(aggregation_stream_triad_cpu_strict_aggregation_test2.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=10 --repetitions=3 --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_aggregation_test2.out) set_tests_properties(aggregation_stream_triad_cpu_strict_aggregation_test2.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_strict_aggregation_test_output2 PROCESSORS 4 @@ -750,7 +766,7 @@ if (CPPUDDLE_WITH_TESTS) # STRICT number of kernel launches should always be same -- hence we can check the aggregation working correctly here -- here it should be exactly 1 - add_test(aggregation_stream_triad_cpu_strict_aggregation_test3.run work_aggregation_cpu_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=100 --repetitions=1 --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_aggregation_test3.out) + add_test(aggregation_stream_triad_cpu_strict_aggregation_test3.run work_aggregation_cpu_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=2048 --problem_size=25600 --kernel_size=256 --max_slices=100 --repetitions=1 --executor_type=STRICT --outputfile=aggregation_stream_triad_cpu_strict_aggregation_test3.out) set_tests_properties(aggregation_stream_triad_cpu_strict_aggregation_test3.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cpu_strict_aggregation_test_output3 PROCESSORS 4 @@ -775,7 +791,7 @@ if (CPPUDDLE_WITH_TESTS) # STREAM TESTS CUDA # Try with few slices -- good to detect deadlocking on errors with the continuations - add_test(aggregation_stream_triad_cuda_eager_test1.run work_aggregation_cuda_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=2 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cuda_eager_test1.out) + add_test(aggregation_stream_triad_cuda_eager_test1.run work_aggregation_cuda_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=2 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cuda_eager_test1.out) set_tests_properties(aggregation_stream_triad_cuda_eager_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_eager_test_output1 PROCESSORS 4 @@ -795,7 +811,7 @@ if (CPPUDDLE_WITH_TESTS) # Try with odd number of slices # This would deadlock given the STRICT executor, the EAGER one should have no problem - add_test(aggregation_stream_triad_cuda_eager_test2.run work_aggregation_cuda_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=17 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cuda_eager_test2.out) + add_test(aggregation_stream_triad_cuda_eager_test2.run work_aggregation_cuda_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=17 
--repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cuda_eager_test2.out) set_tests_properties(aggregation_stream_triad_cuda_eager_test2.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_eager_test_output2 PROCESSORS 4 @@ -814,7 +830,7 @@ if (CPPUDDLE_WITH_TESTS) # Try with large number of slices -- this is basically what should be used in production, hence it should be tested - add_test(aggregation_stream_triad_cuda_eager_test3.run work_aggregation_cuda_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cuda_eager_test3.out) + add_test(aggregation_stream_triad_cuda_eager_test3.run work_aggregation_cuda_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=EAGER --outputfile=aggregation_stream_triad_cuda_eager_test3.out) set_tests_properties(aggregation_stream_triad_cuda_eager_test3.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_eager_test_output3 PROCESSORS 4 @@ -833,7 +849,7 @@ if (CPPUDDLE_WITH_TESTS) # Basic test for the ENDLESS executor -- number slices should not matter here, hence the large value for it - add_test(aggregation_stream_triad_cuda_endless_test1.run work_aggregation_cuda_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=99999999 --repetitions=${deadlock_check_repetitions} --executor_type=ENDLESS --outputfile=aggregation_stream_triad_cuda_endless_test1.out) + add_test(aggregation_stream_triad_cuda_endless_test1.run work_aggregation_cuda_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=99999999 --repetitions=${deadlock_check_repetitions} --executor_type=ENDLESS --outputfile=aggregation_stream_triad_cuda_endless_test1.out) set_tests_properties(aggregation_stream_triad_cuda_endless_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_endless_test_output1 PROCESSORS 4 @@ -852,7 +868,7 @@ if (CPPUDDLE_WITH_TESTS) # Basic test for the STRICT executor - add_test(aggregation_stream_triad_cuda_strict_test1.run work_aggregation_cuda_triad -t 4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=STRICT --outputfile=aggregation_stream_triad_cuda_strict_test1.out) + add_test(aggregation_stream_triad_cuda_strict_test1.run work_aggregation_cuda_triad --hpx:threads=4 --number_aggregation_executors=1 --number_underlying_executors=4 --problem_size=102400 --kernel_size=1024 --max_slices=100 --repetitions=${deadlock_check_repetitions} --executor_type=STRICT --outputfile=aggregation_stream_triad_cuda_strict_test1.out) set_tests_properties(aggregation_stream_triad_cuda_strict_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_strict_test_output1 PROCESSORS 4 diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index ee91328a..9f3b1751 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -414,6 +414,13 @@ class buffer_recycler { #endif unused_buffer_list.clear(); buffer_map.clear(); +#ifdef CPPUDDLE_HAVE_COUNTERS + 
number_allocation = 0; + number_recycling = 0; + number_bad_alloc = 0; + number_creation = 0; + number_wrong_hints = 0; +#endif } public: ~buffer_manager() { From 45dc728561281826f4220aee70a59b24e9ab2824 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Sun, 4 Jun 2023 13:24:59 -0500 Subject: [PATCH 33/46] Use spinlock --- CMakeLists.txt | 2 -- include/buffer_manager.hpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa34af9f..3b3ada5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,8 +127,6 @@ $ ) if(CPPUDDLE_WITH_HPX_MUTEX) - target_compile_definitions(buffer_manager PUBLIC "CPPUDDLE_HAVE_HPX_MUTEX") - target_compile_definitions(stream_manager PUBLIC "CPPUDDLE_HAVE_HPX_MUTEX") target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_MUTEX") target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_MUTEX") message(INFO "Compiling with HPX spinlock") diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 9f3b1751..0596cfab 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -32,7 +32,7 @@ constexpr size_t number_instances = 128; namespace detail { #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) -using mutex_t = hpx::mutex; +using mutex_t = hpx::spinlock; #else using mutex_t = std::mutex; #endif From ffd4a4b99b6154ac8fc4eca6d5b64226f5842714 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Sun, 4 Jun 2023 13:40:20 -0500 Subject: [PATCH 34/46] Use mutex type consistently --- include/aggregation_manager.hpp | 6 +++++- include/stream_manager.hpp | 25 ++++++++++++++++++------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 4441c235..5fdda226 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -42,7 +42,11 @@ #include "../include/buffer_manager.hpp" #include "../include/stream_manager.hpp" -using aggregation_mutex_t = hpx::mutex; +#if defined(CPPUDDLE_HAVE_HPX_MUTEX) +using aggregation_mutex_t = hpx::spinlock; +#else +using aggregation_mutex_t = std::mutex; +#endif //=============================================================================== //=============================================================================== diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index d3bda59d..f87503cb 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -15,6 +15,17 @@ #include #include +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) +// For builds with The HPX mutex +#include +#endif + +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) +using mutex_t = hpx::spinlock; +#else +using mutex_t = std::mutex; +#endif + //#include // #include // #include @@ -259,7 +270,7 @@ class stream_pool { } } static void cleanup() { - std::lock_guard guard(pool_mut); + std::lock_guard guard(pool_mut); if (pool_instance) { pool_instance->streampool.reset(nullptr); pool_instance.reset(nullptr); @@ -267,24 +278,24 @@ class stream_pool { } static std::tuple get_interface() noexcept { - std::lock_guard guard(pool_mut); + std::lock_guard guard(pool_mut); assert(pool_instance); // should already be initialized return pool_instance->streampool->get_interface(); } static void release_interface(size_t index) noexcept { - std::lock_guard guard(pool_mut); + std::lock_guard guard(pool_mut); assert(pool_instance); // should already be initialized 
pool_instance->streampool->release_interface(index); } static bool interface_available(size_t load_limit) noexcept { - std::lock_guard guard(pool_mut); + std::lock_guard guard(pool_mut); if (!pool_instance) { return false; } return pool_instance->streampool->interface_available(load_limit); } static size_t get_current_load() noexcept { - std::lock_guard guard(pool_mut); + std::lock_guard guard(pool_mut); if (!pool_instance) { return 0; } @@ -292,7 +303,7 @@ class stream_pool { return pool_instance->streampool->get_current_load(); } static size_t get_next_device_id() noexcept { - std::lock_guard guard(pool_mut); + std::lock_guard guard(pool_mut); if (!pool_instance) { return 0; } @@ -302,7 +313,7 @@ class stream_pool { private: inline static std::unique_ptr pool_instance{}; stream_pool_implementation() = default; - inline static std::mutex pool_mut{}; + inline static mutex_t pool_mut{}; std::unique_ptr streampool{nullptr}; From 30ef9738937c8003246e9a98b842d668b619d6ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Mon, 5 Jun 2023 13:46:58 -0500 Subject: [PATCH 35/46] Add possiblity to disable buffer recycling at compiletime --- CMakeLists.txt | 32 ++++++++++++++++++++------------ include/buffer_manager.hpp | 16 ++++++++++++++++ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b3ada5b..0d404ad0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,13 +22,14 @@ option(CPPUDDLE_WITH_HPX "Enable HPX integration and examples" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) +option(CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL "Use HPX spinlock mutex instead of std::mutex") if(CPPUDDLE_WITH_HPX_MUTEX) if(NOT CPPUDDLE_WITH_HPX) - message(FATAL_ERROR "CPPUDDLE_WITH_HPX_MUTEX requires a build with HPX (CPPUDDLE_WITH_HPX=ON") + message(FATAL_ERROR " CPPUDDLE_WITH_HPX_MUTEX requires a build with HPX (CPPUDDLE_WITH_HPX=ON") endif() endif() @@ -49,7 +50,7 @@ if (CPPUDDLE_WITH_KOKKOS) # Check that everything required is actyivated if (NOT CPPUDDLE_WITH_HPX) - message(FATAL_ERROR "KOKKOS support requires HPX flag to be turned on") + message(FATAL_ERROR " KOKKOS support requires HPX flag to be turned on") endif() #if (NOT CPPUDDLE_WITH_CUDA AND NOT CPPUDDLE_WITH_HIP) # message(FATAL_ERROR "KOKKOS support requires CUDA flag to be turned on") @@ -58,15 +59,15 @@ if (CPPUDDLE_WITH_KOKKOS) # Check that Kokkos and HPX options are consistent. 
if(Kokkos_ENABLE_CUDA) if(NOT HPX_WITH_CUDA) - message(FATAL_ERROR "Kokkos was built with CUDA support, HPX was not") + message(FATAL_ERROR " Kokkos was built with CUDA support, HPX was not") endif() kokkos_check(OPTIONS CUDA_LAMBDA) if(NOT HPX_WITH_CUDA) - message(FATAL_ERROR "Kokkos was built with CUDA support, HPX was not") + message(FATAL_ERROR " Kokkos was built with CUDA support, HPX was not") endif() else() if(HPX_WITH_CUDA) - message(FATAL_ERROR "HPX was built with CUDA support, Kokkos was not") + message(FATAL_ERROR " HPX was built with CUDA support, Kokkos was not") endif() endif() @@ -129,9 +130,16 @@ $ if(CPPUDDLE_WITH_HPX_MUTEX) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_MUTEX") target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_MUTEX") - message(INFO "Compiling with HPX spinlock") + message(INFO " Compiling with HPX spinlock") else() - message(INFO "Compiling with std::mutex!") + message(INFO " Compiling with std::mutex!") +endif() + +if(CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) + target_compile_definitions(buffer_manager INTERFACE "CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING") + message(WARNING " Slow Build: Buffer recycling is deactivated. This should only be used for performance tests!") +else() + message(INFO " Using default buffer recycling behaviour.") endif() # install libs with the defitions: @@ -793,7 +801,7 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(aggregation_stream_triad_cuda_eager_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_eager_test_output1 PROCESSORS 4 - TIMEOUT 600 + TIMEOUT 1200 ) add_test(aggregation_stream_triad_cuda_eager_test1.check_errors cat aggregation_stream_triad_cuda_eager_test1.out) set_tests_properties(aggregation_stream_triad_cuda_eager_test1.check_errors PROPERTIES @@ -813,7 +821,7 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(aggregation_stream_triad_cuda_eager_test2.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_eager_test_output2 PROCESSORS 4 - TIMEOUT 600 + TIMEOUT 1200 ) add_test(aggregation_stream_triad_cuda_eager_test2.check_errors cat aggregation_stream_triad_cuda_eager_test2.out) set_tests_properties(aggregation_stream_triad_cuda_eager_test2.check_errors PROPERTIES @@ -832,7 +840,7 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(aggregation_stream_triad_cuda_eager_test3.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_eager_test_output3 PROCESSORS 4 - TIMEOUT 600 + TIMEOUT 1200 ) add_test(aggregation_stream_triad_cuda_eager_test3.check_errors cat aggregation_stream_triad_cuda_eager_test3.out) set_tests_properties(aggregation_stream_triad_cuda_eager_test3.check_errors PROPERTIES @@ -851,7 +859,7 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(aggregation_stream_triad_cuda_endless_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_endless_test_output1 PROCESSORS 4 - TIMEOUT 600 + TIMEOUT 1200 ) add_test(aggregation_stream_triad_cuda_endless_test1.check_errors cat aggregation_stream_triad_cuda_endless_test1.out) set_tests_properties(aggregation_stream_triad_cuda_endless_test1.check_errors PROPERTIES @@ -870,7 +878,7 @@ if (CPPUDDLE_WITH_TESTS) set_tests_properties(aggregation_stream_triad_cuda_strict_test1.run PROPERTIES FIXTURES_SETUP aggregation_stream_triad_cuda_strict_test_output1 PROCESSORS 4 - TIMEOUT 600 + TIMEOUT 1200 ) add_test(aggregation_stream_triad_cuda_strict_test1.check_errors cat aggregation_stream_triad_cuda_strict_test1.out) set_tests_properties(aggregation_stream_triad_cuda_strict_test1.check_errors PROPERTIES 
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 0596cfab..7a23eb4c 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -40,6 +40,21 @@ using mutex_t = std::mutex; class buffer_recycler { // Public interface public: +#if defined(CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) +#pragma message \ + "Warning: Running build without buffer recycling! Use only for performance testing!" + template + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional location_hint = std::nullopt) { + return Host_Allocator{}.allocate(number_elements); + } + /// Marks an buffer as unused and fit for reusage + template + static void mark_unused(T *p, size_t number_elements, + std::optional location_hint = std::nullopt) { + return Host_Allocator{}.deallocate(p, number_elements); + } +#else /// Returns and allocated buffer of the requested size - this may be a reused /// buffer template @@ -54,6 +69,7 @@ class buffer_recycler { std::optional location_hint = std::nullopt) { return buffer_manager::mark_unused(p, number_elements); } +#endif /// Deallocate all buffers, no matter whether they are marked as used or not static void clean_all() { std::lock_guard guard(instance().callback_protection_mut); From 0bb0935d9f1a369460b78bec4460c75a97c0138f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Mon, 5 Jun 2023 14:09:31 -0500 Subject: [PATCH 36/46] Fix typo --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d404ad0..bd3dc292 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ option(CPPUDDLE_WITH_HPX "Enable HPX integration and examples" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) -option(CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) +option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL "Use HPX spinlock mutex instead of std::mutex") From 2d1bdfb54d86dd3069bab12e2b664eb1c1f44db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Mon, 5 Jun 2023 14:15:12 -0500 Subject: [PATCH 37/46] Fix typo everywhere --- CMakeLists.txt | 6 +++--- include/buffer_manager.hpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd3dc292..31c76609 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,11 +135,11 @@ else() message(INFO " Compiling with std::mutex!") endif() -if(CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) - target_compile_definitions(buffer_manager INTERFACE "CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING") +if(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING") message(WARNING " Slow Build: Buffer recycling is deactivated. 
This should only be used for performance tests!") else() - message(INFO " Using default buffer recycling behaviour.") + message(INFO " Using default buffer recycling behaviour!") endif() # install libs with the defitions: diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 7a23eb4c..223907ec 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -40,7 +40,7 @@ using mutex_t = std::mutex; class buffer_recycler { // Public interface public: -#if defined(CPPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) +#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) #pragma message \ "Warning: Running build without buffer recycling! Use only for performance testing!" template From 87b508bbafe410de06dcc4feaf1ed11cdcc85541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Mon, 5 Jun 2023 17:56:28 -0500 Subject: [PATCH 38/46] Add flag to disable aggressive allocators --- CMakeLists.txt | 6 ++++++ include/aggregation_manager.hpp | 2 +- include/buffer_manager.hpp | 4 +++- include/hpx_buffer_util.hpp | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 31c76609..dd169243 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) +option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF) set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL "Use HPX spinlock mutex instead of std::mutex") @@ -142,6 +143,11 @@ else() message(INFO " Using default buffer recycling behaviour!") endif() +if(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS) + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS") + message(WARNING " Slow Build: Aggressive allocators disabled. This should only be used for performance tests!") +endif() + # install libs with the defitions: install(TARGETS buffer_manager EXPORT CPPuddle ) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 5fdda226..6b5a916e 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -138,7 +138,7 @@ template class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) #pragma message \ - "Running slow work aggegator debug build! Run with NDEBUG defined for fast build..." + "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." /// Stores the function call of the first slice as reference for error /// checking std::any function_tuple; diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 223907ec..cb4cf4c3 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -42,7 +42,7 @@ class buffer_recycler { public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) #pragma message \ - "Warning: Running build without buffer recycling! Use only for performance testing!" + "Warning: Building without buffer recycling! Use only for performance testing!" 
template static T *get(size_t number_elements, bool manage_content_lifetime = false, std::optional location_hint = std::nullopt) { @@ -515,6 +515,7 @@ struct aggressive_recycle_allocator { void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused(p, n); } +#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS template inline void construct(T *p, Args... args) noexcept { // Do nothing here - we reuse the content of the last owner @@ -523,6 +524,7 @@ struct aggressive_recycle_allocator { // Do nothing here - Contents will be destroyed when the buffer manager is // destroyed, not before } +#endif }; template constexpr bool diff --git a/include/hpx_buffer_util.hpp b/include/hpx_buffer_util.hpp index 54c52d1f..f1b8cdd6 100644 --- a/include/hpx_buffer_util.hpp +++ b/include/hpx_buffer_util.hpp @@ -76,6 +76,7 @@ struct numa_aware_aggressive_recycle_allocator { void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused(p, n, dealloc_hint); } +#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS template inline void construct(T *p, Args... args) noexcept { // Do nothing here - we reuse the content of the last owner @@ -84,6 +85,7 @@ struct numa_aware_aggressive_recycle_allocator { // Do nothing here - Contents will be destroyed when the buffer manager is // destroyed, not before } +#endif }; template constexpr bool From 0f885febd0724529b273362099bb078b76cceb4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 6 Jun 2023 14:10:40 -0500 Subject: [PATCH 39/46] HPX/NUMA-aware allocators are now enabled at compile time --- CMakeLists.txt | 84 ++++++++++++++++++----- include/aggregation_manager.hpp | 8 +++ include/aligned_buffer_util.hpp | 14 ---- include/buffer_manager.hpp | 78 ++++++++++++++++++++- include/cuda_buffer_util.hpp | 11 --- include/hip_buffer_util.hpp | 12 +--- include/hpx_buffer_util.hpp | 117 -------------------------------- include/sycl_buffer_util.hpp | 11 --- tests/allocator_hpx_test.cpp | 97 -------------------------- 9 files changed, 153 insertions(+), 279 deletions(-) delete mode 100644 include/hpx_buffer_util.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index dd169243..26ec306b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,42 +8,73 @@ cmake_minimum_required(VERSION 3.16) project(CPPuddle CXX C) # Some random project name set(CMAKE_CXX_STANDARD 17) +#------------------------------------------------------------------------------------------------------------ +# Version + set(CPPUDDLE_VERSION_MAJOR 0) set(CPPUDDLE_VERSION_MINOR 1) set(CPPUDDLE_VERSION_PATCH 99) set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}.${CPPUDDLE_VERSION_PATCH}.") -option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) -set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") -option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. 
Useful for extended testing" OFF) +#------------------------------------------------------------------------------------------------------------ +# Define Options + +# GPU-related options option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) -option(CPPUDDLE_WITH_HPX "Enable HPX integration and examples" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) -option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) -option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) +# HPX-related options +option(CPPUDDLE_WITH_HPX "Enable basic HPX integration and examples" OFF) +option(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS "Enable HPX-aware allocators for even better HPX integration" ON) +set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL + "Use HPX spinlock mutex instead of std::mutex") +# Test-related options +option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF) +option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF) +set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests") option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF) option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF) +# Tooling options +option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF) +option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF) -set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL - "Use HPX spinlock mutex instead of std::mutex") +#------------------------------------------------------------------------------------------------------------ +# Define dependencies and conflicts/incompatibilities +# Find HPX for HPX-enabled builds +if (CPPUDDLE_WITH_HPX) + find_package(HPX 1.8.0 REQUIRED) # older versions might work but are untested with the current cppuddle +endif() + +# HPX mutex requires HPX-Support if(CPPUDDLE_WITH_HPX_MUTEX) if(NOT CPPUDDLE_WITH_HPX) - message(FATAL_ERROR " CPPUDDLE_WITH_HPX_MUTEX requires a build with HPX (CPPUDDLE_WITH_HPX=ON") + message(FATAL_ERROR " CPPUDDLE_WITH_HPX_MUTEX requires a build with HPX (CPPUDDLE_WITH_HPX=ON)") endif() endif() +# HPX build are really better with HPX-aware allocators: Warn if disabled +if(CPPUDDLE_WITH_HPX) + if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) + message(WARNING " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is recommended for HPX builds \ +(currently OFF even though CPPUDDLE_WITH_HPX=ON). Performance negatively impacted!") + endif() +endif() + +# HPX-aware allocators require HPX-Support. Warn if HPX support is disabled as we fallback on non-aware +# allocators +if(NOT CPPUDDLE_WITH_HPX) + if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) + message(WARNING " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON even though CPPUDDLE_WITH_HPX=OFF. 
\ +Falling back on non-hpx-aware allocators which will negatively impact performance!") + endif() +endif() + +# GPU options -- mostly for tests actually, otherwise simply including the the headers +# offers users all functionality regardless of these options if (CPPUDDLE_WITH_CUDA) enable_language(CUDA) endif () - -if (CPPUDDLE_WITH_HPX) - find_package(HPX REQUIRED) -endif() -if (CPPUDDLE_WITH_TESTS) - find_package(Boost REQUIRED program_options) -endif() if (CPPUDDLE_WITH_KOKKOS) # Find packages find_package(Kokkos 3.0.0 REQUIRED) @@ -75,6 +106,11 @@ if (CPPUDDLE_WITH_KOKKOS) kokkos_check(DEVICES HPX) endif() +# For builds with tests we need Boost for the program_options +if (CPPUDDLE_WITH_TESTS) + find_package(Boost REQUIRED program_options) +endif() + # Add Linter warnings if (CPPUDDLE_WITH_CLANG_TIDY) find_program(CLANG_TIDY "clang-tidy") @@ -102,11 +138,18 @@ if (CPPUDDLE_WITH_CLANG_FORMAT) endif() endif() +#------------------------------------------------------------------------------------------------------------ +# Define library targets and installation +# (also includes various warnings for non-optimal build configurations) ## Interface targets add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX") + if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) + message(INFO " Compiling HPX-aware allocators!") + target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") + endif() endif() if (CPPUDDLE_WITH_COUNTERS) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") @@ -119,6 +162,9 @@ target_include_directories(buffer_manager INTERFACE add_library(stream_manager INTERFACE) if (CPPUDDLE_WITH_HPX) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX") + if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) + target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") + endif() endif() if (CPPUDDLE_WITH_COUNTERS) target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS") @@ -161,6 +207,9 @@ install( install(FILES cppuddle-config.cmake DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/cmake/CPPuddle/) install(EXPORT CPPuddle NAMESPACE CPPuddle:: DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/cmake/CPPuddle/) +#------------------------------------------------------------------------------------------------------------ +# Define cmake targets for all tests/example executables + ## Add target for tests and tests definitions if (CPPUDDLE_WITH_TESTS) add_executable(allocator_test tests/allocator_test.cpp) @@ -275,6 +324,9 @@ if (CPPUDDLE_WITH_TESTS) message(WARNING, " Multi-GPU Support not yet properly tested!") endif() +#------------------------------------------------------------------------------------------------------------ +# Define actual tests (usually running the binary and checking its output for certain patterns via regex) + enable_testing() # Basic functionality tests diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 6b5a916e..8c51c182 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -556,8 +556,16 @@ template class Aggregated_Executor { if (buffer_counter <= slice_alloc_counter) { constexpr bool manage_content_lifetime = false; buffers_in_use = true; +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + // Deactivated HPX-aware allocation... 
+ const size_t location_id = 0; + // not recommended for performance +#pragma message \ +"Warning: Running work aggregation without HPX-aware allocators enabled. Performance negatively impacted!" +#else // get prefered location: aka the current hpx threads location const size_t location_id = hpx::get_worker_thread_num(); +#endif // Get shiny and new buffer that will be shared between all slices // Buffer might be recycled from previous allocations by the // buffer_recycler... diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 8c83e3a9..456420bc 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -8,9 +8,6 @@ #include "buffer_manager.hpp" #include -#ifdef CPPUDDLE_HAVE_HPX -#include "hpx_buffer_util.hpp" -#endif namespace recycler { template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< T, boost::alignment::aligned_allocator>; -#ifdef CPPUDDLE_HAVE_HPX -template ::value, int> = 0> -using numa_aware_recycle_aligned = detail::numa_aware_recycle_allocator< - T, boost::alignment::aligned_allocator>; -template ::value, int> = 0> -using numa_aware_aggressive_recycle_aligned = - detail::numa_aware_aggressive_recycle_allocator< - T, boost::alignment::aligned_allocator>; -#endif } // namespace recycler #endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index cb4cf4c3..4cacb9b6 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -18,6 +18,18 @@ #include #include +// Warn about suboptimal performance without correct HPX-aware allocators +#ifdef CPPUDDLE_HAVE_HPX +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS +#pragma message \ +"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ +For better performance configure CPPuddle with the cmake option CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON !" +#else +// include runtime to get HPX thread IDs required for the HPX-aware allocators +#include +#endif +#endif + #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) // For builds with The HPX mutex #include @@ -41,8 +53,12 @@ class buffer_recycler { // Public interface public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) + +// Warn about suboptimal performance without recycling #pragma message \ - "Warning: Building without buffer recycling! Use only for performance testing!" +"Warning: Building without buffer recycling! Use only for performance testing! \ +For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF !" 
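Editor's note: the gist of the CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS gate introduced here is that allocation/deallocation hints come from the current HPX worker thread when available and degrade to "no hint" otherwise. The sketch below shows that branch in isolation; the `<hpx/hpx.hpp>` include is an assumption to keep it standalone, whereas the patch itself pulls in a narrower HPX runtime header.

```
// Editorial sketch of the worker-thread location hint (assumption: compiled
// against HPX whenever the HPX-aware path is enabled).
#include <cstddef>
#include <optional>

#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS)
#include <hpx/hpx.hpp>
inline std::optional<std::size_t> current_location_hint() {
  // prefer buffers last touched by this worker thread
  return hpx::get_worker_thread_num();
}
#else
inline std::optional<std::size_t> current_location_hint() {
  return std::nullopt; // no worker/NUMA hint without HPX support
}
#endif
```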
+ template static T *get(size_t number_elements, bool manage_content_lifetime = false, std::optional location_hint = std::nullopt) { @@ -463,10 +479,15 @@ class buffer_recycler { template struct recycle_allocator { using value_type = T; + const std::optional dealloc_hint; + +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS recycle_allocator() noexcept = default; template + explicit recycle_allocator(size_t hint) noexcept + : dealloc_hint(hint) {} explicit recycle_allocator( - recycle_allocator const &) noexcept {} + recycle_allocator const &other) noexcept {} T *allocate(std::size_t n) { T *data = buffer_recycler::get(n); return data; @@ -474,6 +495,24 @@ template struct recycle_allocator { void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused(p, n); } +#else + recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num()) {} + explicit recycle_allocator(size_t hint) noexcept + : dealloc_hint(hint) {} + explicit recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(other.dealloc_hint) {} + T *allocate(std::size_t n) { + T *data = buffer_recycler::get( + n, false, hpx::get_worker_thread_num()); + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_recycler::mark_unused(p, n, dealloc_hint); + } +#endif + template inline void construct(T *p, Args... args) noexcept { ::new (static_cast(p)) T(std::forward(args)...); @@ -503,6 +542,9 @@ operator!=(recycle_allocator const &, template struct aggressive_recycle_allocator { using value_type = T; + std::optional dealloc_hint; + +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS aggressive_recycle_allocator() noexcept = default; template explicit aggressive_recycle_allocator( @@ -515,6 +557,25 @@ struct aggressive_recycle_allocator { void deallocate(T *p, std::size_t n) { buffer_recycler::mark_unused(p, n); } +#else + aggressive_recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num()) {} + explicit aggressive_recycle_allocator(size_t hint) noexcept + : dealloc_hint(hint) {} + explicit aggressive_recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(other.dealloc_hint) {} + T *allocate(std::size_t n) { + T *data = buffer_recycler::get( + n, true, hpx::get_worker_thread_num()); // also initializes the buffer + // if it isn't reused + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_recycler::mark_unused(p, n, dealloc_hint); + } +#endif + #ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS template inline void construct(T *p, Args... args) noexcept { @@ -524,8 +585,19 @@ struct aggressive_recycle_allocator { // Do nothing here - Contents will be destroyed when the buffer manager is // destroyed, not before } +#else +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without content reusage for aggressive allocators! \ +For better performance configure with the cmake option CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF !" + template + inline void construct(T *p, Args... 
args) noexcept { + ::new (static_cast(p)) T(std::forward(args)...); + } + void destroy(T *p) { p->~T(); } #endif }; + template constexpr bool operator==(aggressive_recycle_allocator const &, @@ -558,6 +630,8 @@ using aggressive_recycle_std = inline void force_cleanup() { detail::buffer_recycler::clean_all(); } /// Deletes all buffers currently marked as unused inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. inline void finalize() { detail::buffer_recycler::finalize(); } } // end namespace recycler diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index d94c0708..d2d0f596 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -7,9 +7,6 @@ #define CUDA_BUFFER_UTIL_HPP #include "buffer_manager.hpp" -#ifdef CPPUDDLE_HAVE_HPX -#include "hpx_buffer_util.hpp" -#endif #include #include @@ -106,14 +103,6 @@ using recycle_allocator_cuda_host = template ::value, int> = 0> using recycle_allocator_cuda_device = detail::recycle_allocator>; -#ifdef CPPUDDLE_HAVE_HPX -template ::value, int> = 0> -using numa_aware_recycle_allocator_cuda_host = - detail::numa_aware_aggressive_recycle_allocator>; -template ::value, int> = 0> -using hpx_aware_recycle_allocator_cuda_device = - detail::numa_aware_recycle_allocator>; -#endif template ::value, int> = 0> struct cuda_device_buffer { diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 1d1b3d61..5a4209c1 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -7,9 +7,6 @@ #define HIP_BUFFER_UTIL_HPP #include "buffer_manager.hpp" -#ifdef CPPUDDLE_HAVE_HPX -#include "hpx_buffer_util.hpp" -#endif #include #include @@ -112,15 +109,8 @@ using recycle_allocator_hip_host = template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; -#ifdef CPPUDDLE_HAVE_HPX -template ::value, int> = 0> -using numa_aware_recycle_allocator_hip_host = - detail::numa_aware_aggressive_recycle_allocator>; -template ::value, int> = 0> -using hpx_aware_recycle_allocator_hip_device = - detail::numa_aware_recycle_allocator>; -#endif +// TODO Is this even required? (cuda version should work fine...) template ::value, int> = 0> struct hip_device_buffer { size_t gpu_id{0}; diff --git a/include/hpx_buffer_util.hpp b/include/hpx_buffer_util.hpp deleted file mode 100644 index f1b8cdd6..00000000 --- a/include/hpx_buffer_util.hpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2023 Gregor Daiß -// -// Distributed under the Boost Software License, Version 1.0. 
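Editor's note: a short usage sketch for the allocator aliases and cleanup entry points touched above. It assumes the `recycler::recycle_std` / `recycler::aggressive_recycle_std` aliases exported by buffer_manager.hpp and is meant as an illustration, not as part of the patch.

```
// Usage sketch: recycled std::vector buffers plus the cleanup entry points.
#include <vector>
#include "buffer_manager.hpp" // CPPuddle header providing the aliases below

void recycling_example() {
  {
    // first use allocates a fresh buffer
    std::vector<double, recycler::recycle_std<double>> a(1000, 0.0);
  } // destruction only marks the buffer as unused, it is not freed yet
  {
    // a request of the same size can pick that buffer up again; the aggressive
    // variant additionally skips re-initializing the recycled contents
    std::vector<double, recycler::aggressive_recycle_std<double>> b(1000);
  }
  recycler::cleanup();       // free only buffers currently marked unused
  recycler::force_cleanup(); // free everything, whether marked used or not
}
```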
(See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - - -#ifndef CPPUDDLE_HPX_BUFFER_UTIL_HPP -#define CPPUDDLE_HPX_BUFFER_UTIL_HPP - -#include "buffer_manager.hpp" -#include - -namespace recycler { -namespace detail { - -template - struct numa_aware_recycle_allocator { - using value_type = T; - const std::optional dealloc_hint; - numa_aware_recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num()) {} - explicit numa_aware_recycle_allocator(size_t hint) noexcept - : dealloc_hint(hint) {} - explicit numa_aware_recycle_allocator( - numa_aware_recycle_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, false, hpx::get_worker_thread_num()); - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint); - } - template - inline void construct(T *p, Args... args) noexcept { - ::new (static_cast(p)) T(std::forward(args)...); - } - void destroy(T *p) { p->~T(); } -}; -template -constexpr bool -operator==(numa_aware_recycle_allocator const &, - numa_aware_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return true; - else - return false; -} -template -constexpr bool -operator!=(numa_aware_recycle_allocator const &, - numa_aware_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return false; - else - return true; -} - -/// Recycles not only allocations but also the contents of a buffer -template -struct numa_aware_aggressive_recycle_allocator { - using value_type = T; - std::optional dealloc_hint; - numa_aware_aggressive_recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num()) {} - explicit numa_aware_aggressive_recycle_allocator(size_t hint) noexcept - : dealloc_hint(hint) {} - explicit numa_aware_aggressive_recycle_allocator( - numa_aware_recycle_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, true, hpx::get_worker_thread_num()); // also initializes the buffer - // if it isn't reused - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint); - } -#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS - template - inline void construct(T *p, Args... 
args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -#endif -}; -template -constexpr bool -operator==(numa_aware_aggressive_recycle_allocator const &, - numa_aware_aggressive_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return true; - else - return false; -} -template -constexpr bool -operator!=(numa_aware_aggressive_recycle_allocator const &, - numa_aware_aggressive_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return false; - else - return true; -} - -} -template ::value, int> = 0> -using numa_aware_recycle_std = detail::numa_aware_recycle_allocator>; -template ::value, int> = 0> -using numa_aware_aggressive_recycle_std = - detail::numa_aware_aggressive_recycle_allocator>; -} - -#endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 28014da5..6469aa4e 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -7,9 +7,6 @@ #define SYCL_BUFFER_UTIL_HPP #include "buffer_manager.hpp" -#ifdef CPPUDDLE_HAVE_HPX -#include "hpx_buffer_util.hpp" -#endif #include #include @@ -79,14 +76,6 @@ using recycle_allocator_sycl_host = template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; -#ifdef CPPUDDLE_HAVE_HPX -template ::value, int> = 0> -using numa_aware_recycle_allocator_sycl_host = - detail::numa_aware_aggressive_recycle_allocator>; -template ::value, int> = 0> -using hpx_aware_recycle_allocator_sycl_device = - detail::numa_aware_recycle_allocator>; -#endif } // end namespace recycler #endif diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 2e48f925..4d11cc16 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -16,7 +16,6 @@ #include #include "../include/buffer_manager.hpp" -#include "../include/hpx_buffer_util.hpp" int hpx_main(int argc, char *argv[]) { @@ -75,102 +74,6 @@ int hpx_main(int argc, char *argv[]) { assert(number_futures >= 1); // NOLINT assert(number_futures <= max_number_futures); // NOLINT - { - size_t aggressive_duration = 0; - size_t recycle_duration = 0; - size_t default_duration = 0; - - // test using std::allocator: - { - auto begin = std::chrono::high_resolution_clock::now(); - std::vector> futs(max_number_futures); - for (size_t i = 0; i < max_number_futures; i++) { - futs[i] = hpx::make_ready_future(); - } - for (size_t pass = 0; pass < passes; pass++) { - for (size_t i = 0; i < number_futures; i++) { - futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector test6(array_size, double{}); - }); - } - } - auto when = hpx::when_all(futs); - when.wait(); - auto end = std::chrono::high_resolution_clock::now(); - default_duration = - std::chrono::duration_cast(end - begin) - .count(); - std::cout << "\n==> Non-recycle allocation test took " << default_duration - << "ms" << std::endl; - } - - // test using normal recycle allocator - { - auto begin = std::chrono::high_resolution_clock::now(); - std::vector> futs(max_number_futures); - for (size_t i = 0; i < max_number_futures; i++) { - futs[i] = hpx::make_ready_future(); - } - for (size_t pass = 0; pass < passes; pass++) { - for (size_t i = 0; i < number_futures; i++) { - futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, - double{}); - }); - } - } - auto when = hpx::when_all(futs); - when.wait(); - auto end = 
std::chrono::high_resolution_clock::now(); - recycle_duration = - std::chrono::duration_cast(end - begin) - .count(); - std::cout << "\n==> NUMA-aware recycle allocation test took " << recycle_duration - << "ms" << std::endl; - } - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison - - // Aggressive recycle Test: - { - auto begin = std::chrono::high_resolution_clock::now(); - std::vector> futs(max_number_futures); - for (size_t i = 0; i < max_number_futures; i++) { - futs[i] = hpx::make_ready_future(); - } - for (size_t pass = 0; pass < passes; pass++) { - for (size_t i = 0; i < number_futures; i++) { - futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); - }); - } - } - auto when = hpx::when_all(futs); - when.wait(); - auto end = std::chrono::high_resolution_clock::now(); - aggressive_duration = - std::chrono::duration_cast(end - begin) - .count(); - std::cout << "\n==> NUMA-aware aggressive recycle allocation test took " - << aggressive_duration << "ms" << std::endl; - } - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison - - - - if (aggressive_duration < recycle_duration) { - std::cout << "Test information: NUMA-aware aggressive recycler was faster than normal " - "recycler!" - << std::endl; - } - if (recycle_duration < default_duration) { - std::cout << "Test information: NUMA-aware recycler was faster than default allocator!" - << std::endl; - } - } - { size_t aggressive_duration = 0; size_t recycle_duration = 0; From df8245c19e87a4574222e22dfc27b45e87781067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 6 Jun 2023 14:22:55 -0500 Subject: [PATCH 40/46] Fix test --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26ec306b..22970c8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,7 +147,7 @@ add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX") if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS) - message(INFO " Compiling HPX-aware allocators!") + message(INFO " Compiling with HPX-aware allocators!") target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS") endif() endif() @@ -456,7 +456,7 @@ if (CPPUDDLE_WITH_TESTS) add_test(allocator_concurrency_test.analyse_recycle_rate cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.analyse_recycle_rate PROPERTIES FIXTURES_REQUIRED allocator_concurrency_output - PASS_REGULAR_EXPRESSION "==> Recycle rate: [ ]* 99.5%" + PASS_REGULAR_EXPRESSION "==> Recycle rate: [ ]* 99.*%" ) add_test(allocator_concurrency_test.analyse_marked_buffers_cleanup cat allocator_concurrency_test.out) set_tests_properties(allocator_concurrency_test.analyse_marked_buffers_cleanup PROPERTIES From 36af4a1695d25b56fd05b2dfa488621ed8a377d1 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Tue, 6 Jun 2023 15:13:23 -0500 Subject: [PATCH 41/46] Fix non-hpx-aware build --- include/aggregation_manager.hpp | 20 +++++++++++--------- include/buffer_manager.hpp | 18 +++++++++++------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 8c51c182..fe1de846 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -556,15 +556,17 @@ template class Aggregated_Executor { if (buffer_counter <= 
slice_alloc_counter) { constexpr bool manage_content_lifetime = false; buffers_in_use = true; -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - // Deactivated HPX-aware allocation... - const size_t location_id = 0; - // not recommended for performance -#pragma message \ -"Warning: Running work aggregation without HPX-aware allocators enabled. Performance negatively impacted!" -#else - // get prefered location: aka the current hpx threads location - const size_t location_id = hpx::get_worker_thread_num(); + + // Default location -- useful for GPU builds as we otherwise create way too + // many different buffers for different aggregation sizes on different GPUs + size_t location_id = 0; +#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + if (max_slices == 1) { + // get prefered location: aka the current hpx threads location + // Usually handy for CPU builds where we want to use the buffers + // close to the current CPU core + location_id = hpx::get_worker_thread_num(); + } #endif // Get shiny and new buffer that will be shared between all slices // Buffer might be recycled from previous allocations by the diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 4cacb9b6..cc2a6d07 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -482,12 +482,13 @@ template struct recycle_allocator { const std::optional dealloc_hint; #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - recycle_allocator() noexcept = default; - template + recycle_allocator() noexcept + : dealloc_hint(std::nullopt) {} explicit recycle_allocator(size_t hint) noexcept - : dealloc_hint(hint) {} + : dealloc_hint(std::nullopt) {} explicit recycle_allocator( - recycle_allocator const &other) noexcept {} + recycle_allocator const &other) noexcept + : dealloc_hint(std::nullopt) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get(n); return data; @@ -545,10 +546,13 @@ struct aggressive_recycle_allocator { std::optional dealloc_hint; #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - aggressive_recycle_allocator() noexcept = default; - template + aggressive_recycle_allocator() noexcept + : dealloc_hint(std::nullopt) {} + explicit aggressive_recycle_allocator(size_t hint) noexcept + : dealloc_hint(std::nullopt) {} explicit aggressive_recycle_allocator( - aggressive_recycle_allocator const &) noexcept {} + aggressive_recycle_allocator const &) noexcept + : dealloc_hint(std::nullopt) {} T *allocate(std::size_t n) { T *data = buffer_recycler::get( n, true); // also initializes the buffer if it isn't reused From 29b4f3383d34d22e998c7a33e7f160c77bf27d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 6 Jun 2023 15:43:44 -0500 Subject: [PATCH 42/46] Remove noexcept form stream_manager --- include/stream_manager.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index f87503cb..1d48bcd6 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -277,24 +277,24 @@ class stream_pool { } } - static std::tuple get_interface() noexcept { + static std::tuple get_interface() { std::lock_guard guard(pool_mut); assert(pool_instance); // should already be initialized return pool_instance->streampool->get_interface(); } - static void release_interface(size_t index) noexcept { + static void release_interface(size_t index) { std::lock_guard guard(pool_mut); assert(pool_instance); // should already be initialized pool_instance->streampool->release_interface(index); } - static bool 
interface_available(size_t load_limit) noexcept { + static bool interface_available(size_t load_limit) { std::lock_guard guard(pool_mut); if (!pool_instance) { return false; } return pool_instance->streampool->interface_available(load_limit); } - static size_t get_current_load() noexcept { + static size_t get_current_load() { std::lock_guard guard(pool_mut); if (!pool_instance) { return 0; @@ -302,7 +302,7 @@ class stream_pool { assert(pool_instance); // should already be initialized return pool_instance->streampool->get_current_load(); } - static size_t get_next_device_id() noexcept { + static size_t get_next_device_id() { std::lock_guard guard(pool_mut); if (!pool_instance) { return 0; From 921b502a14a3f19710f13acc71b9f29e9dde92ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 6 Jun 2023 15:55:01 -0500 Subject: [PATCH 43/46] Update readme --- README.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9e3adb32..38f8f88c 100644 --- a/README.md +++ b/README.md @@ -16,23 +16,32 @@ In this use-case, allocating GPU buffers for all sub-grids in advance would have - Allocators that reuse previousely allocated buffers if available (works with normal heap memory, pinned memory, aligned memory, CUDA/HIP device memory, and Kokkos Views). Note that separate buffers do not coexist on a single chunk of continuous memory, but use different allocations. - Executor pools and various scheduling policies (round robin, priority queue, multi-gpu), which rely on reference counting to gauge the current load of a executor instead of querying the device itself. Tested with CUDA, HIP and Kokkos executors provided by HPX / HPX-Kokkos. +- Special Executors/Allocators for on-the-fly work GPU aggregation (using HPX). #### Requirements -- C++14 -- CMake (>= 3.11) +- C++17 +- CMake (>= 3.16) - Optional (for the header-only utilities / test): CUDA, Boost, [HPX](https://github.com/STEllAR-GROUP/hpx), [Kokkos](https://github.com/kokkos/kokkos), [HPX-Kokkos](https://github.com/STEllAR-GROUP/hpx-kokkos) The submodules can be used to obtain the optional dependencies which are required for testing the header-only utilities. If these tests are not required, the submodule (and the respective buildscripts in /scripts) can be ignored safely. 
#### Build / Install +Basic build + ``` cmake -H/path/to/source -B$/path/to/build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/path/to/install/cppuddle -DCPPUDDLE_WITH_TESTS=OFF -DCPPUDDLE_WITH_COUNTERS=OFF - cmake --build /path/to/build -- -j4 VERBOSE=1 cmake --build /path/to/build --target install ``` -If installed correctly, cppuddle can be used in other cmake-based projects via +If installed correctly, CPPuddle can be used in other CMake-based projects via ``` find_package(CPPuddle REQUIRED) ``` + +Recommended build: +``` + cmake -H/path/to/source -B$/path/to/build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/path/to/install/cppuddle -DCPPUDDLE_WITH_HPX=ON -DCPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON -DCPPUDDLE_WITH_TESTS=OFF -DCPPUDDLE_WITH_COUNTERS=OFF +``` + + From af1900e48b7acd8eb471cdb1880ca6ecefcf3db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Tue, 6 Jun 2023 16:52:15 -0500 Subject: [PATCH 44/46] New default build --- scripts/configure_build_directory.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/configure_build_directory.sh b/scripts/configure_build_directory.sh index 1b2f6fd8..c949e8a3 100755 --- a/scripts/configure_build_directory.sh +++ b/scripts/configure_build_directory.sh @@ -54,9 +54,9 @@ mkdir -p ${INSTALL_DIR} pushd ${BUILD_DIR} # TODO Reactivate CUDA/KOKKOS once we have a newer cmake version on the test machine if [[ "${CXX}" == "clang++" ]]; then # clang/cmake too old on our usual machine - compile without CUDA - cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=ON -DCPPUDDLE_MUTEXLESS_MODE=OFF -DCPPUDDLE_WITH_CUDA=ON -DCPPUDDLE_WITH_KOKKOS=ON -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. + cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=OFF -DCPPUDDLE_WITH_CUDA=OFF -DCPPUDDLE_WITH_KOKKOS=OFF -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. else - cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=ON -DCPPUDDLE_MUTEXLESS_MODE=ON -DCPPUDDLE_WITH_CUDA=ON -DCPPUDDLE_WITH_KOKKOS=ON -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. + cmake -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCPPUDDLE_WITH_TESTS=ON -DCPPUDDLE_WITH_HPX=OFF -DCPPUDDLE_WITH_CUDA=OFF -DCPPUDDLE_WITH_KOKKOS=OFF -DCPPUDDLE_WITH_COUNTERS=ON -DHPX_DIR=${HPX_ROOT} -DKokkos_DIR=${Kokkos_ROOT} -DHPXKokkos_DIR=${HPXKokkos_ROOT} ../.. 
fi
popd
cp ${BUILD_DIR}/compile_commands.json ${SCRIPTS_DIR}/../compile_commands.json

From 77bedc4807c5a5604635bd31c769a6ec9638c5a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?=
Date: Tue, 6 Jun 2023 16:52:28 -0500
Subject: [PATCH 45/46] Make compatible with older user code by swapping exception for warning

---
 include/buffer_manager.hpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index fdf00d53..ddbffb70 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -260,6 +260,7 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV
         // not enough memory left! Cleanup and attempt again:
         std::cerr << "Not enough memory left. Cleaning up unused buffers now..." << std::endl;
         buffer_recycler::clean_unused_buffers();
+        std::cerr << "Buffers cleaned! Try allocation again..." << std::endl;
         // If there still isn't enough memory left, the caller has to handle it
         // We've done all we can in here
@@ -272,6 +273,7 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV
         instance()[location_id].number_creation++;
         instance()[location_id].number_bad_alloc++;
 #endif
+        std::cerr << "Second attempt allocation successful!" << std::endl;
         if (manage_content_lifetime) {
           std::uninitialized_value_construct_n(buffer, number_of_elements);
         }
@@ -334,8 +336,18 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV
       }
     }
-    // Failure -- something is very wrong
-    throw std::runtime_error("Tried to delete non-existing buffer");
+    // TODO Throw exception instead in the future, as soon as the recycler finalize is
+    // used in all user code
+    /* throw std::runtime_error("Tried to delete non-existing buffer"); */
+
+    // This is odd: Print warning -- however, might also happen with static
+    // buffers using these allocators IF the new finalize was not called. For
+    // now, print warning until all user-code is upgraded to the finalize method.
+    // This allows using current versions of cppuddle with older application code
+    std::cerr
+        << "Warning! Tried to delete non-existing buffer within CPPuddle!"
+        << std::endl;
+    std::cerr << "Did you forget to call recycler::finalize?" << std::endl;
   }

 private:

From 7e16672b22b8e1d46c9f11e59a0dcab8367c55f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?=
Date: Tue, 6 Jun 2023 17:11:33 -0500
Subject: [PATCH 46/46] Adapt test to aggregation alloc location changes

---
 CMakeLists.txt | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22970c8a..12674d2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -572,16 +572,15 @@ if (CPPUDDLE_WITH_TESTS)
     FIXTURES_SETUP aggregation_basic_parallel_test_output
     PROCESSORS 4
   )
-  # new concurrent buffer managers change the game here..
as two aggregated runs are in parallel: 3 1 add_test(aggregation_basic_parallel_test.analyse_int_buffers cat aggregation_basic_parallel_test.out) set_tests_properties(aggregation_basic_parallel_test.analyse_int_buffers PROPERTIES FIXTURES_REQUIRED aggregation_basic_parallel_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 1" + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 2" ) add_test(aggregation_basic_parallel_test.analyse_float_buffers cat aggregation_basic_parallel_test.out) set_tests_properties(aggregation_basic_parallel_test.analyse_float_buffers PROPERTIES FIXTURES_REQUIRED aggregation_basic_parallel_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 6" ) add_test(aggregation_basic_parallel_test.analyse_cleanup cat aggregation_basic_parallel_test.out) set_tests_properties(aggregation_basic_parallel_test.analyse_cleanup PROPERTIES @@ -642,7 +641,7 @@ if (CPPUDDLE_WITH_TESTS) add_test(aggregation_add_pointer_test.analyse_number_buffers cat aggregation_add_pointer_test.out) set_tests_properties(aggregation_add_pointer_test.analyse_number_buffers PROPERTIES FIXTURES_REQUIRED aggregation_add_pointer_test_output - PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 6" ) add_test(aggregation_add_references_test_sequential.run work_aggregation_test --hpx:threads=1 --outputfile=aggregation_add_references_test_sequential.out --scenario=references_add_test) @@ -670,12 +669,11 @@ if (CPPUDDLE_WITH_TESTS) FIXTURES_REQUIRED aggregation_add_references_test_output PASS_REGULAR_EXPRESSION "Number add_launches=1" ) - # TODO Re-enable test as soon as we have aggregated counters... - # add_test(aggregation_add_references_test.analyse_number_buffers cat aggregation_add_references_test.out) - # set_tests_properties(aggregation_add_references_test.analyse_number_buffers PROPERTIES - # FIXTURES_REQUIRED aggregation_add_references_test_output - # PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" - # ) + add_test(aggregation_add_references_test.analyse_number_buffers cat aggregation_add_references_test.out) + set_tests_properties(aggregation_add_references_test.analyse_number_buffers PROPERTIES + FIXTURES_REQUIRED aggregation_add_references_test_output + PASS_REGULAR_EXPRESSION "--> Number of buffers that got requested from this manager: [ ]* 3" + ) # STREAM TESTS CPU
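Editor's note: PATCH 45 above downgrades the "non-existing buffer" error to a warning precisely because older applications never call the newer `recycler::finalize()`. A minimal sketch of the intended usage, assuming the `recycler::recycle_std` alias from buffer_manager.hpp:

```
// Sketch: hand buffers back while the managers still exist, then finalize.
#include <vector>
#include "buffer_manager.hpp"

int main() {
  {
    std::vector<int, recycler::recycle_std<int>> data(512, 1);
  } // buffer is returned to the recycler here, before teardown
  recycler::finalize(); // tears down all buffer managers; no further use allowed
  return 0;
}
```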