From 80f8ff530817f6fb66b09d133000d56f677fd449 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Sun, 11 Jun 2023 23:32:26 +0200
Subject: [PATCH 01/42] Add stream manager per gpu and set device id for
 mallocs

---
 CMakeLists.txt                  |  29 +++++-
 include/aggregation_manager.hpp |  94 ++++++++++--------
 include/buffer_manager.hpp      |  22 ++---
 include/cuda_buffer_util.hpp    |  43 ++-------
 include/detail/config.hpp       |  64 +++++++++++++
 include/hip_buffer_util.hpp     |  23 +----
 include/stream_manager.hpp      | 163 ++++++++++++++++----------------
 7 files changed, 246 insertions(+), 192 deletions(-)
 create mode 100644 include/detail/config.hpp
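
Sketch of the worker-to-GPU mapping this patch introduces (illustrative values;
the real constants come from the new CPPUDDLE_WITH_MAX_NUMBER_WORKERS and
CPPUDDLE_WITH_NUMBER_GPUS cache variables and land in include/detail/config.hpp
below):

    // Assumed example configuration: 128 worker threads, 4 GPUs.
    constexpr size_t number_instances = 128;  // CPPUDDLE_MAX_NUMBER_WORKERS
    constexpr size_t max_number_gpus = 4;     // CPPUDDLE_MAX_NUMBER_GPUS
    constexpr size_t instances_per_gpu = number_instances / max_number_gpus; // 32
    // Workers 0..31 drive GPU 0, workers 32..63 drive GPU 1, and so on --
    // this mirrors get_device_id() in detail/config.hpp.
    size_t device_id_for(size_t worker_thread_num) {
      return worker_thread_num / instances_per_gpu;
    }

The CUDA/HIP allocators then call cudaSetDevice(get_device_id()) or
hipSetDevice(get_device_id()) before each allocation, and the new
stream_pool::set_device_selector hook lets an application register an
equivalent callback for its stream pools.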

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12674d2e..5801e499 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,15 +23,17 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}
 option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF)
 option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF)
 option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF)
+set(CPPUDDLE_WITH_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when running with the maximum number of HPX worker threads. Should be 1 for non-HPX builds.")
 # HPX-related options
 option(CPPUDDLE_WITH_HPX "Enable basic HPX integration and examples" OFF)
 option(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS "Enable HPX-aware allocators for even better HPX integration" ON)
 set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL
   "Use HPX spinlock mutex instead of std::mutex")
+set(CPPUDDLE_WITH_MAX_NUMBER_WORKERS "128" CACHE STRING "Max number of worker threads supported. Should match the intended number of HPX workers or be 1 in non-HPX builds.")
 # Test-related options
 option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF)
 option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF)
-set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
+set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "10000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
 option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF)
 option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF)
 # Tooling options
@@ -61,6 +63,19 @@ if(CPPUDDLE_WITH_HPX)
   endif()
 endif()
 
+if(CPPUDDLE_WITH_NUMBER_GPUS GREATER 1)
+  if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
+    message(FATAL_ERROR " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is required for Multi-GPU builds!")
+  endif()
+endif()
+
+if(CPPUDDLE_WITH_MAX_NUMBER_WORKERS GREATER 1)
+  if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
+    message(FATAL_ERROR " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is required for Multi-Worker builds! \
+    Either turn it on or configure with CPPUDDLE_WITH_MAX_NUMBER_WORKERS=1 !")
+  endif()
+endif()
+
 # HPX-aware allocators require HPX-Support. Warn if HPX support is disabled as we fallback on non-aware
 # allocators
 if(NOT CPPUDDLE_WITH_HPX)
@@ -80,7 +95,7 @@ if (CPPUDDLE_WITH_KOKKOS)
   find_package(Kokkos 3.0.0 REQUIRED)
   find_package(HPXKokkos REQUIRED) 
 
-  # Check that everything required is actyivated
+  # Check that everything required is activated
   if (NOT CPPUDDLE_WITH_HPX)
     message(FATAL_ERROR " KOKKOS support requires HPX flag to be turned on")
   endif()
@@ -149,6 +164,11 @@ if (CPPUDDLE_WITH_HPX)
   if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
     message(INFO " Compiling with HPX-aware allocators!")
     target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}")
+  else()
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
   endif()
 endif()
 if (CPPUDDLE_WITH_COUNTERS)
@@ -164,6 +184,11 @@ if (CPPUDDLE_WITH_HPX)
   target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX")
   if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
     target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}")
+  else()
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
   endif()
 endif()
 if (CPPUDDLE_WITH_COUNTERS)
diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index fe1de846..47803043 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -1,3 +1,8 @@
+// Copyright (c) 2022-2023 Gregor Daiß
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
 #ifndef WORK_AGGREGATION_MANAGER
 #define WORK_AGGREGATION_MANAGER
 
@@ -27,7 +32,6 @@
 #include <hpx/include/iostreams.hpp>
 #include <hpx/include/lcos.hpp>
 #include <hpx/lcos/promise.hpp>
-//#include <hpx/synchronization/mutex.hpp> // obsolete
 #include <hpx/mutex.hpp>
 
 #if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP)
@@ -41,12 +45,7 @@
 
 #include "../include/buffer_manager.hpp"
 #include "../include/stream_manager.hpp"
-
-#if defined(CPPUDDLE_HAVE_HPX_MUTEX)
-using aggregation_mutex_t = hpx::spinlock;
-#else
-using aggregation_mutex_t = std::mutex;
-#endif
+#include "../include/detail/config.hpp"
 
 //===============================================================================
 //===============================================================================
@@ -313,7 +312,8 @@ template <typename Executor> class aggregated_function_call {
         potential_async_promises[local_counter].get_future();
     if (local_counter == number_slices - 1) {
       /* slices_ready_promise.set_value(); */
-      auto fut = exec_async_wrapper<Executor, F, Ts...>(underlying_executor, std::forward<F>(f), std::forward<Ts>(ts)...);
+      auto fut = exec_async_wrapper<Executor, F, Ts...>(
+          underlying_executor, std::forward<F>(f), std::forward<Ts>(ts)...);
       fut.then([this](auto &&fut) {
         for (auto &promise : potential_async_promises) {
           promise.set_value();
@@ -808,8 +808,10 @@ template <typename Executor> class Aggregated_Executor {
       if (local_slice_id == 1) {
         // Renew promise that all slices will be ready as the primary launch criteria...
         hpx::lcos::shared_future<void> fut;
-        if (mode == Aggregated_Executor_Modes::EAGER || mode == Aggregated_Executor_Modes::ENDLESS) {
-          // Fallback launch condidtion: Launch as soon as the underlying stream is ready
+        if (mode == Aggregated_Executor_Modes::EAGER ||
+            mode == Aggregated_Executor_Modes::ENDLESS) {
+          // Fallback launch condition: Launch as soon as the underlying stream
+          // is ready
           /* auto slices_full_fut = slices_full_promise.get_future(); */
           auto exec_fut = executor.get_future(); 
           /* fut = hpx::when_any(exec_fut, slices_full_fut); */
@@ -835,15 +837,16 @@ template <typename Executor> class Aggregated_Executor {
       }
       if (local_slice_id >= max_slices &&
           mode != Aggregated_Executor_Modes::ENDLESS) {
-        slices_exhausted = true; // prevents any more threads from entering before the continuation is launched
-          /* launched_slices = current_slices; */
-          /* size_t id = 0; */
-          /* for (auto &slice_promise : executor_slices) { */
-          /*   slice_promise.set_value( */
-          /*       Executor_Slice{*this, id, launched_slices}); */
-          /*   id++; */
-          /* } */
-          /* executor_slices.clear(); */
+        slices_exhausted = true; // prevents any more threads from entering
+                                 // before the continuation is launched
+        /* launched_slices = current_slices; */
+        /* size_t id = 0; */
+        /* for (auto &slice_promise : executor_slices) { */
+        /*   slice_promise.set_value( */
+        /*       Executor_Slice{*this, id, launched_slices}); */
+        /*   id++; */
+        /* } */
+        /* executor_slices.clear(); */
         if (mode == Aggregated_Executor_Modes::STRICT ) {
           slices_full_promise.set_value(); // Trigger slices launch condition continuation 
         }
@@ -1005,38 +1008,40 @@ class aggregation_pool {
   template <typename... Ts>
   static void init(size_t number_of_executors, size_t slices_per_executor,
                    Aggregated_Executor_Modes mode) {
-    std::lock_guard<aggregation_mutex_t> guard(instance.pool_mutex);
-    assert(instance.aggregation_executor_pool.empty());
+    const size_t gpu_id = get_device_id();
+    std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
+    assert(instance()[gpu_id].aggregation_executor_pool.empty());
     for (int i = 0; i < number_of_executors; i++) {
-      instance.aggregation_executor_pool.emplace_back(slices_per_executor,
+      instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor,
                                                       mode);
     }
-    instance.slices_per_executor = slices_per_executor;
-    instance.mode = mode;
+    instance()[gpu_id].slices_per_executor = slices_per_executor;
+    instance()[gpu_id].mode = mode;
   }
 
   /// Will always return a valid executor slice
   static decltype(auto) request_executor_slice(void) {
-    std::lock_guard<aggregation_mutex_t> guard(instance.pool_mutex);
-    assert(!instance.aggregation_executor_pool.empty());
+    const size_t gpu_id = get_device_id();
+    std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
+    assert(!instance()[gpu_id].aggregation_executor_pool.empty());
     std::optional<hpx::lcos::future<
         typename Aggregated_Executor<Interface>::Executor_Slice>>
         ret;
-    size_t local_id = (instance.current_interface) %
-                      instance.aggregation_executor_pool.size();
-    ret = instance.aggregation_executor_pool[local_id].request_executor_slice();
+    size_t local_id = (instance()[gpu_id].current_interface) %
+                      instance()[gpu_id].aggregation_executor_pool.size();
+    ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice();
     // Expected case: current aggregation executor is free
     if (ret.has_value()) {
       return ret;
     }
     // current interface is bad -> find free one
     size_t abort_counter = 0;
-    const size_t abort_number = instance.aggregation_executor_pool.size() + 1;
+    const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1;
     do {
-      local_id = (++(instance.current_interface)) % // increment interface
-                 instance.aggregation_executor_pool.size();
+      local_id = (++(instance()[gpu_id].current_interface)) % // increment interface
+                 instance()[gpu_id].aggregation_executor_pool.size();
       ret =
-          instance.aggregation_executor_pool[local_id].request_executor_slice();
+          instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice();
       if (ret.has_value()) {
         return ret;
       }
@@ -1044,12 +1049,15 @@ class aggregation_pool {
     } while (abort_counter <= abort_number);
     // Everything's busy -> create new aggregation executor (growing pool) OR
     // return empty optional
-    if (instance.growing_pool) {
-      instance.aggregation_executor_pool.emplace_back(
-          instance.slices_per_executor, instance.mode);
-      instance.current_interface = instance.aggregation_executor_pool.size() - 1;
-      assert(instance.aggregation_executor_pool.size() < 20480);
-      ret = instance.aggregation_executor_pool[instance.current_interface].request_executor_slice();
+    if (instance()[gpu_id].growing_pool) {
+      instance()[gpu_id].aggregation_executor_pool.emplace_back(
+          instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode);
+      instance()[gpu_id].current_interface =
+          instance()[gpu_id].aggregation_executor_pool.size() - 1;
+      assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480);
+      ret = instance()[gpu_id]
+                .aggregation_executor_pool[instance()[gpu_id].current_interface]
+                .request_executor_slice();
       assert(ret.has_value()); // fresh executor -- should always have slices
                                // available
     }
@@ -1066,9 +1074,13 @@ class aggregation_pool {
 private:
   /// Required for dealing with adding elements to the deque of
   /// aggregated_executors
-  static inline aggregation_mutex_t pool_mutex;
+  aggregation_mutex_t pool_mutex;
   /// Global access instance
-  static inline aggregation_pool instance{};
+  static std::unique_ptr<aggregation_pool[]>& instance(void) {
+    static std::unique_ptr<aggregation_pool[]> pool_instances{
+        new aggregation_pool[max_number_gpus]};
+    return pool_instances;
+  }
   aggregation_pool() = default;
 
 public:
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index ddbffb70..8907a0e5 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2021 Gregor Daiß
+// Copyright (c) 2020-2023 Gregor Daiß
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -23,7 +23,7 @@
 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
 #pragma message                                                                \
 "Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \
-For better performance configure CPPuddle with the cmake option CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON !"
+For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!"
 #else
 // include runtime to get HPX thread IDs required for the HPX-aware allocators
 #include <hpx/include/runtime.hpp>
@@ -39,16 +39,11 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_WITH_HP
 #include <boost/core/demangle.hpp>
 #endif
 
+#include "../include/detail/config.hpp"
+
 namespace recycler {
-constexpr size_t number_instances = 128;
 namespace detail {
 
-#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
-using mutex_t = hpx::spinlock;
-#else
-using mutex_t = std::mutex;
-#endif
-
 class buffer_recycler {
   // Public interface
 public:
@@ -57,7 +52,7 @@ class buffer_recycler {
 // Warn about suboptimal performance without recycling
 #pragma message                                                                \
 "Warning: Building without buffer recycling! Use only for performance testing! \
-For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF !"
+For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF!"
 
   template <typename T, typename Host_Allocator>
   static T *get(size_t number_elements, bool manage_content_lifetime = false,
@@ -258,7 +253,9 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV
         return buffer;
       } catch (std::bad_alloc &e) {
         // not enough memory left! Cleanup and attempt again:
-        std::cerr << "Not enough memory left. Cleaning up unused buffers now..." << std::endl;
+        std::cerr 
+          << "Not enough memory left. Cleaning up unused buffers now..." 
+          << std::endl;
         buffer_recycler::clean_unused_buffers();
         std::cerr << "Buffers cleaned! Try allocation again..." << std::endl;
 
@@ -370,7 +367,6 @@ For better performance configure CPPuddle with the cmake option CPPUDDLE_DEACTIV
     buffer_manager&
     operator=(buffer_manager<T, Host_Allocator> &&other) = delete;
     static std::unique_ptr<buffer_manager[]>& instance(void) {
-      /* static std::array<buffer_manager, number_instances> instances{{}}; */
       static std::unique_ptr<buffer_manager[]> instances{
           new buffer_manager[number_instances]};
       return instances;
@@ -603,7 +599,7 @@ struct aggressive_recycle_allocator {
 // Warn about suboptimal performance without recycling
 #pragma message                                                                \
 "Warning: Building without content reusage for aggressive allocators! \
-For better performance configure with the cmake option CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF !"
+For better performance configure with CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF!"
   template <typename... Args>
   inline void construct(T *p, Args... args) noexcept {
     ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...);
diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index d2d0f596..5d0cb156 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2021 Gregor Daiß
+// Copyright (c) 2020-2023 Gregor Daiß
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -7,6 +7,7 @@
 #define CUDA_BUFFER_UTIL_HPP
 
 #include "buffer_manager.hpp"
+#include "detail/config.hpp"
 
 #include <cuda_runtime.h>
 #include <stdexcept>
@@ -22,6 +23,7 @@ template <class T> struct cuda_pinned_allocator {
   template <class U>
   explicit cuda_pinned_allocator(cuda_pinned_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
+    cudaSetDevice(get_device_id());
     T *data;
     cudaError_t error =
         cudaMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T));
@@ -62,6 +64,7 @@ template <class T> struct cuda_device_allocator {
   template <class U>
   explicit cuda_device_allocator(cuda_device_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
+    cudaSetDevice(get_device_id());
     T *data;
     cudaError_t error = cudaMalloc(&data, n * sizeof(T));
     if (error != cudaSuccess) {
@@ -114,29 +117,14 @@ struct cuda_device_buffer {
     device_side_buffer =
         recycle_allocator_cuda_device<T>{}.allocate(number_of_elements);
   }
+  // TODO deprecate and remove gpu_id
   explicit cuda_device_buffer(size_t number_of_elements, size_t gpu_id)
       : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) {
-#if defined(CPPUDDLE_HAVE_MULTIGPU) 
-    cudaSetDevice(gpu_id);
-#else
-    // TODO It would be better to have separate method for this but it would change the interface
-    // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice
-    // Allows for testing without any changes to other projects 
-    assert(gpu_id == 0); 
-#endif
+    assert(gpu_id == 0);
     device_side_buffer =
         recycle_allocator_cuda_device<T>{}.allocate(number_of_elements);
   }
   ~cuda_device_buffer() {
-#if defined(CPPUDDLE_HAVE_MULTIGPU) 
-    if (set_id)
-      cudaSetDevice(gpu_id);
-#else
-    // TODO It would be better to have separate method for this but it would change the interface
-    // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice
-    // Allows for testing without any changes to other projects 
-    assert(gpu_id == 0); 
-#endif
     recycle_allocator_cuda_device<T>{}.deallocate(device_side_buffer,
                                                   number_of_elements);
   }
@@ -160,29 +148,14 @@ struct cuda_aggregated_device_buffer {
     device_side_buffer =
         recycle_allocator_cuda_device<T>{}.allocate(number_of_elements);
   }
+  // TODO deprecate and remove gpu_id
   explicit cuda_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc)
       : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) {
-#if defined(CPPUDDLE_HAVE_MULTIGPU) 
-    cudaSetDevice(gpu_id);
-#else
-    // TODO It would be better to have separate method for this but it would change the interface
-    // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice
-    // Allows for testing without any changes to other projects 
-    assert(gpu_id == 0); 
-#endif
+    assert(gpu_id == 0);
     device_side_buffer =
         alloc.allocate(number_of_elements);
   }
   ~cuda_aggregated_device_buffer() {
-#if defined(CPPUDDLE_HAVE_MULTIGPU) 
-    if (set_id)
-      cudaSetDevice(gpu_id);
-#else
-    // TODO It would be better to have separate method for this but it would change the interface
-    // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice
-    // Allows for testing without any changes to other projects 
-    assert(gpu_id == 0); 
-#endif
     alloc.deallocate(device_side_buffer,
                                                   number_of_elements);
   }
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
new file mode 100644
index 00000000..fd15cfc2
--- /dev/null
+++ b/include/detail/config.hpp
@@ -0,0 +1,64 @@
+// Copyright (c) 2023-2023 Gregor Daiß
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef CPPUDDLE_CONFIG_HPP
+#define CPPUDDLE_CONFIG_HPP
+
+// Mutex configuration
+//
+#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
+#include <hpx/mutex.hpp>
+using mutex_t = hpx::spinlock;
+using aggregation_mutex_t = hpx::spinlock;
+#else
+#include <mutex>
+using mutex_t = std::mutex;
+using aggregation_mutex_t = std::mutex;
+#endif
+
+// HPX-aware configuration
+//
+#ifdef CPPUDDLE_HAVE_HPX
+#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
+#pragma message                                                                \
+"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \
+For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!"
+#else
+// include runtime to get HPX thread IDs required for the HPX-aware allocators
+#include <hpx/include/runtime.hpp>
+#endif
+#endif
+
+// Recycling configuration
+// TODO Add warnings here
+
+// Aggressive recycling configuration
+// TODO Add warning here
+
+// Aggregation Debug configuration
+// TODO Add warning here
+
+// Thread and MultiGPU configuration
+//
+constexpr size_t number_instances = CPPUDDLE_MAX_NUMBER_WORKERS;
+constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS;
+#ifndef CPPUDDLE_HAVE_HPX
+static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu");
+#endif
+static_assert(number_instances >= max_number_gpus);
+static_assert(max_number_gpus > 0);
+static_assert(number_instances > 0);
+constexpr size_t instances_per_gpu = number_instances / max_number_gpus;
+
+/// Uses HPX thread information to determine which GPU should be used
+size_t get_device_id(void) {
+#if defined(CPPUDDLE_HAVE_HPX) 
+    return hpx::get_worker_thread_num() / instances_per_gpu; 
+#else
+    return 0;
+#endif
+}
+
+#endif
diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp
index 5a4209c1..c33566e5 100644
--- a/include/hip_buffer_util.hpp
+++ b/include/hip_buffer_util.hpp
@@ -22,6 +22,7 @@ template <class T> struct hip_pinned_allocator {
   template <class U>
   explicit hip_pinned_allocator(hip_pinned_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
+    hipSetDevice(get_device_id());
     T *data;
     // hipError_t error =
     //     hipMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T));
@@ -68,6 +69,7 @@ template <class T> struct hip_device_allocator {
   template <class U>
   explicit hip_device_allocator(hip_device_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
+    hipSetDevice(get_device_id());
     T *data;
     hipError_t error = hipMalloc(&data, n * sizeof(T));
     if (error != hipSuccess) {
@@ -123,16 +125,10 @@ struct hip_device_buffer {
   }
   explicit hip_device_buffer(size_t number_of_elements, size_t gpu_id)
       : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) {
-
-    // TODO Fix Multi GPU support
-    // hipSetDevice(gpu_id);
     device_side_buffer =
         recycle_allocator_hip_device<T>{}.allocate(number_of_elements);
   }
   ~hip_device_buffer() {
-    // TODO Fix Multi GPU support
-    // if (set_id)
-    //   hipSetDevice(gpu_id);
     recycle_allocator_hip_device<T>{}.deallocate(device_side_buffer,
                                                   number_of_elements);
   }
@@ -158,27 +154,12 @@ struct hip_aggregated_device_buffer {
   }
   explicit hip_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc)
       : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) {
-#if defined(CPPUDDLE_HAVE_MULTIGPU) 
-    hipSetDevice(gpu_id);
-#else
-    // TODO It would be better to have separate method for this but it would change the interface
-    // This will have to do for some testing. If it's worth it, add separate method without hipSetDevice
-    // Allows for testing without any changes to other projects 
     assert(gpu_id == 0); 
-#endif
     device_side_buffer =
         alloc.allocate(number_of_elements);
   }
   ~hip_aggregated_device_buffer() {
-#if defined(CPPUDDLE_HAVE_MULTIGPU) 
-    if (set_id)
-      hipSetDevice(gpu_id);
-#else
-    // TODO It would be better to have separate method for this but it would change the interface
-    // This will have to do for some testing. If it's worth it, add separate method without hipSetDevice
-    // Allows for testing without any changes to other projects 
     assert(gpu_id == 0); 
-#endif
     alloc.deallocate(device_side_buffer,
                                                   number_of_elements);
   }
diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index 1d48bcd6..87740d2d 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2021 Gregor Daiß
+// Copyright (c) 2020-2023 Gregor Daiß
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -7,28 +7,25 @@
 #define STREAM_MANAGER_HPP
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <deque>
 #include <iostream>
 #include <memory>
 #include <mutex>
 #include <queue>
+#include <tuple>
 #include <type_traits>
 
-#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
-// For builds with The HPX mutex
-#include <hpx/mutex.hpp>
-#endif
-
-#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
-using mutex_t = hpx::spinlock;
-#else
-using mutex_t = std::mutex;
-#endif
+#include "../include/detail/config.hpp"
 
-//#include <cuda_runtime.h>
-// #include <hpx/compute/cuda/target.hpp>
-// #include <hpx/include/compute.hpp>
+/// Turns a std::array of mutexes into a scoped lock
+template<typename mutex_array_t>
+auto make_scoped_lock_from_array(mutex_array_t& mutexes)
+{
+    return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, 
+                      mutexes);
+}
 
 template <class Interface> class round_robin_pool {
 private:
@@ -38,10 +35,10 @@ template <class Interface> class round_robin_pool {
 
 public:
   template <typename... Ts>
-  explicit round_robin_pool(size_t number_of_streams, Ts &&... executor_args) {
+  round_robin_pool(size_t number_of_streams, Ts... executor_args) {
     ref_counters.reserve(number_of_streams);
     for (int i = 0; i < number_of_streams; i++) {
-      pool.emplace_back(std::forward<Ts>(executor_args)...);
+      pool.emplace_back(executor_args...);
       ref_counters.emplace_back(0);
     }
   }
@@ -74,11 +71,11 @@ template <class Interface> class priority_pool {
   std::vector<size_t> priorities{};   // Ref counters
 public:
   template <typename... Ts>
-  explicit priority_pool(size_t number_of_streams, Ts &&... executor_args) {
+  priority_pool(size_t number_of_streams, Ts... executor_args) {
     ref_counters.reserve(number_of_streams);
     priorities.reserve(number_of_streams);
     for (auto i = 0; i < number_of_streams; i++) {
-      pool.emplace_back(std::forward<Ts>(executor_args)...);
+      pool.emplace_back(executor_args...);
       ref_counters.emplace_back(0);
       priorities.emplace_back(i);
     }
@@ -120,11 +117,11 @@ template <class Interface, class Pool> class multi_gpu_round_robin_pool {
 public:
   template <typename... Ts>
   multi_gpu_round_robin_pool(size_t number_of_streams, int number_of_gpus,
-                             Ts &&... executor_args)
+                             Ts... executor_args)
       : streams_per_gpu{number_of_streams} {
     for (auto gpu_id = 0; gpu_id < number_of_gpus; gpu_id++) {
       pool.push_back(std::make_tuple(
-          Pool(number_of_streams, gpu_id, std::forward<Ts>(executor_args)...),
+          Pool(number_of_streams, gpu_id, executor_args...),
           0));
     }
   }
@@ -175,7 +172,7 @@ template <class Interface, class Pool> class priority_pool_multi_gpu {
 public:
   template <typename... Ts>
   priority_pool_multi_gpu(size_t number_of_streams, int number_of_gpus,
-                          Ts &&... executor_args)
+                          Ts... executor_args)
       : streams_per_gpu(number_of_streams) {
     ref_counters.reserve(number_of_gpus);
     priorities.reserve(number_of_gpus);
@@ -183,7 +180,7 @@ template <class Interface, class Pool> class priority_pool_multi_gpu {
       priorities.emplace_back(gpu_id);
       ref_counters.emplace_back(0);
       gpu_interfaces.emplace_back(streams_per_gpu, gpu_id,
-                                  std::forward<Ts>(executor_args)...);
+                                  executor_args...);
     }
   }
   // return a tuple with the interface and its index (to release it later)
@@ -222,33 +219,41 @@ template <class Interface, class Pool> class priority_pool_multi_gpu {
 class stream_pool {
 public:
   template <class Interface, class Pool, typename... Ts>
-  static void init(size_t number_of_streams, Ts &&... executor_args) {
-    stream_pool_implementation<Interface, Pool>::init(
-        number_of_streams, std::forward<Ts>(executor_args)...);
-  }
+  static void init(size_t number_of_streams, Ts ... executor_args) {
+    stream_pool_implementation<Interface, Pool>::init(number_of_streams,
+                                                      executor_args...);
+  }
   template <class Interface, class Pool> static void cleanup() {
     stream_pool_implementation<Interface, Pool>::cleanup();
   }
   template <class Interface, class Pool>
   static std::tuple<Interface &, size_t> get_interface() {
-    return stream_pool_implementation<Interface, Pool>::get_interface();
+    return stream_pool_implementation<Interface, Pool>::get_interface(get_device_id());
   }
   template <class Interface, class Pool>
   static void release_interface(size_t index) noexcept {
-    stream_pool_implementation<Interface, Pool>::release_interface(index);
+    stream_pool_implementation<Interface, Pool>::release_interface(index,
+        get_device_id());
   }
   template <class Interface, class Pool>
   static bool interface_available(size_t load_limit) noexcept {
     return stream_pool_implementation<Interface, Pool>::interface_available(
-        load_limit);
+        load_limit, get_device_id());
   }
   template <class Interface, class Pool>
   static size_t get_current_load() noexcept {
-    return stream_pool_implementation<Interface, Pool>::get_current_load();
+    return stream_pool_implementation<Interface, Pool>::get_current_load(
+        get_device_id());
   }
+  // TODO deprecated! Remove...
   template <class Interface, class Pool>
   static size_t get_next_device_id() noexcept {
-    return stream_pool_implementation<Interface, Pool>::get_next_device_id();
+    return stream_pool_implementation<Interface, Pool>::get_next_device_id(get_device_id());
+  }
+
+  template <class Interface, class Pool>
+  static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
+    stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function);
   }
 
 private:
@@ -258,64 +263,66 @@ class stream_pool {
   template <class Interface, class Pool> class stream_pool_implementation {
   public:
     template <typename... Ts>
-    static void init(size_t number_of_streams, Ts &&... executor_args) {
-      // TODO(daissgr) What should happen if the instance already exists?
-      // warning?
-      if (!pool_instance && number_of_streams > 0) {
-        // NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
-        pool_instance.reset(new stream_pool_implementation());
-        // NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
-        pool_instance->streampool.reset(
-            new Pool{number_of_streams, std::forward<Ts>(executor_args)...});
+    static void init(size_t number_of_streams, Ts ... executor_args) {
+      auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      if (number_of_streams > 0) {
+        for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) {
+          instance().select_gpu_function(gpu_id);
+          instance().streampools.emplace_back(number_of_streams,
+                                              executor_args...);
+        }
       }
     }
+
+    // TODO add/rename into finalize?
     static void cleanup() {
-      std::lock_guard<mutex_t> guard(pool_mut);
-      if (pool_instance) {
-        pool_instance->streampool.reset(nullptr);
-        pool_instance.reset(nullptr);
-      }
+      auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      instance().streampools.clear();
     }
 
-    static std::tuple<Interface &, size_t> get_interface() {
-      std::lock_guard<mutex_t> guard(pool_mut);
-      assert(pool_instance); // should already be initialized
-      return pool_instance->streampool->get_interface();
+    static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) {
+      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      return instance().streampools[gpu_id].get_interface();
     }
-    static void release_interface(size_t index) {
-      std::lock_guard<mutex_t> guard(pool_mut);
-      assert(pool_instance); // should already be initialized
-      pool_instance->streampool->release_interface(index);
+    static void release_interface(size_t index, const size_t gpu_id = 0) {
+      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      instance().streampools[gpu_id].release_interface(index);
     }
-    static bool interface_available(size_t load_limit) {
-      std::lock_guard<mutex_t> guard(pool_mut);
-      if (!pool_instance) {
-        return false;
-      }
-      return pool_instance->streampool->interface_available(load_limit);
+    static bool interface_available(size_t load_limit, const size_t gpu_id = 0) {
+      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      return instance().streampools[gpu_id].interface_available(load_limit);
     }
-    static size_t get_current_load() {
-      std::lock_guard<mutex_t> guard(pool_mut);
-      if (!pool_instance) {
-        return 0;
-      }
-      assert(pool_instance); // should already be initialized
-      return pool_instance->streampool->get_current_load();
+    static size_t get_current_load(const size_t gpu_id = 0) {
+      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      return instance().streampools[gpu_id].get_current_load();
     }
-    static size_t get_next_device_id() {
-      std::lock_guard<mutex_t> guard(pool_mut);
-      if (!pool_instance) {
-        return 0;
-      }
-      return pool_instance->streampool->get_next_device_id();
+    // TODO deprecated! Remove...
+    static size_t get_next_device_id(const size_t gpu_id = 0) {
+      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      return instance().streampools[gpu_id].get_next_device_id();
+    }
+
+    static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
+      auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      instance().select_gpu_function = select_gpu_function;
     }
 
   private:
-    inline static std::unique_ptr<stream_pool_implementation> pool_instance{};
     stream_pool_implementation() = default;
-    inline static mutex_t pool_mut{};
+    mutex_t pool_mut{};
+    std::function<void(size_t)> select_gpu_function = [](size_t gpu_id) {
+      // By default there is no multi-GPU support
+      assert(max_number_gpus == 1);
+      assert(gpu_id == 0);
+    };
+
+    std::deque<Pool> streampools{};
+    std::array<mutex_t, max_number_gpus> gpu_mutexes;
 
-    std::unique_ptr<Pool> streampool{nullptr};
+    static stream_pool_implementation& instance(void) {
+      static stream_pool_implementation pool_instance{};
+      return pool_instance;
+    }
 
   public:
     ~stream_pool_implementation() = default;
@@ -338,13 +345,9 @@ class stream_pool {
   stream_pool &operator=(stream_pool &&other) = delete;
 };
 
-/* template <class Interface, class Pool> */
-/* std::unique_ptr<stream_pool::stream_pool_implementation<Interface, Pool>> */
-/*     stream_pool::stream_pool_implementation<Interface, Pool>::pool_instance{}; */
-
 template <class Interface, class Pool> class stream_interface {
 public:
-  explicit stream_interface()
+  stream_interface()
       : t(stream_pool::get_interface<Interface, Pool>()),
         interface(std::get<0>(t)), interface_index(std::get<1>(t)) {}
 

From 1685f84278b80060399f8e74d09cb8c906cd2801 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Sun, 11 Jun 2023 23:43:45 +0200
Subject: [PATCH 02/42] Use correct recycler instance per gpu

---
 include/aggregation_manager.hpp | 3 ++-
 include/sycl_buffer_util.hpp    | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)
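
Sketch of the location_id rounding applied in this patch, assuming
instances_per_gpu == 32 (e.g. 128 workers across 4 GPUs); the helper name is
illustrative only:

    // All HPX workers driving the same GPU share one buffer-manager location:
    // workers 0..31 -> 0, workers 32..63 -> 32, workers 64..95 -> 64, ...
    size_t location_id_for(size_t worker_thread_num, size_t instances_per_gpu) {
      return (worker_thread_num / instances_per_gpu) * instances_per_gpu;
    }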

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 47803043..35139495 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -565,7 +565,8 @@ template <typename Executor> class Aggregated_Executor {
           // get prefered location: aka the current hpx threads location
           // Usually handy for CPU builds where we want to use the buffers
           // close to the current CPU core
-          location_id = hpx::get_worker_thread_num();
+          location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu;
+          // Rounding down to a multiple of instances_per_gpu ensures that all workers driving one GPU use the same instance for their GPU buffers.
         }
 #endif
         // Get shiny and new buffer that will be shared between all slices
diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp
index 6469aa4e..61d22f8f 100644
--- a/include/sycl_buffer_util.hpp
+++ b/include/sycl_buffer_util.hpp
@@ -16,6 +16,8 @@ namespace recycler {
 
 namespace detail {
 
+static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!");
+
 template <class T> struct sycl_host_default_allocator {
   using value_type = T;
   sycl_host_default_allocator() noexcept = default;

From d3423b433e5a5013f60ecc4a13f43cb3c62cf637 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 00:28:15 +0200
Subject: [PATCH 03/42] Make global function inline

---
 include/detail/config.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index fd15cfc2..4c8eca37 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -53,7 +53,7 @@ static_assert(number_instances > 0);
 constexpr size_t instances_per_gpu = number_instances / max_number_gpus;
 
 /// Uses HPX thread information to determine which GPU should be used
-size_t get_device_id(void) {
+inline size_t get_device_id(void) {
 #if defined(CPPUDDLE_HAVE_HPX) 
     return hpx::get_worker_thread_num() / instances_per_gpu; 
 #else

From 493710f504e34dec9c16ab81723cbe5c57f177b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 17:49:30 +0200
Subject: [PATCH 04/42] Fix default location id

---
 include/aggregation_manager.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 35139495..2682a55c 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -559,7 +559,7 @@ template <typename Executor> class Aggregated_Executor {
 
         // Default location -- useful for GPU builds as we otherwise create way too
         // many different buffers for different aggregation sizes on different GPUs
-        size_t location_id = 0;
+        size_t location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu;
 #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
         if (max_slices == 1) {
           // get prefered location: aka the current hpx threads location

From 3c1c3510d13574e24ce52524d41ea4452e3b7a21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 19:56:31 +0200
Subject: [PATCH 05/42] Testing extra flag for multi-gpu

---
 include/cuda_buffer_util.hpp | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)
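
Hypothetical usage sketch of the new auto_select_device template flag
(allocator and element type chosen for illustration): the default keeps the
cudaSetDevice(get_device_id()) call from the previous patches, while passing
false leaves device selection to the caller.

    recycler::recycle_allocator_cuda_device<float> auto_alloc;          // device picked via get_device_id()
    recycler::recycle_allocator_cuda_device<float, false> manual_alloc; // caller selects the device
    cudaSetDevice(1); // e.g. done by the surrounding executor
    float *buf = manual_alloc.allocate(1024);
    manual_alloc.deallocate(buf, 1024);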

diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index 5d0cb156..bbe01349 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -17,13 +17,14 @@ namespace recycler {
 
 namespace detail {
 
-template <class T> struct cuda_pinned_allocator {
+template <class T, bool auto_select_device = true> struct cuda_pinned_allocator {
   using value_type = T;
   cuda_pinned_allocator() noexcept = default;
   template <class U>
   explicit cuda_pinned_allocator(cuda_pinned_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
-    cudaSetDevice(get_device_id());
+    if constexpr (auto_select_device)
+      cudaSetDevice(get_device_id());
     T *data;
     cudaError_t error =
         cudaMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T));
@@ -58,13 +59,14 @@ constexpr bool operator!=(cuda_pinned_allocator<T> const &,
   return false;
 }
 
-template <class T> struct cuda_device_allocator {
+template <class T, bool auto_select_device = true> struct cuda_device_allocator {
   using value_type = T;
   cuda_device_allocator() noexcept = default;
   template <class U>
   explicit cuda_device_allocator(cuda_device_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
-    cudaSetDevice(get_device_id());
+    if constexpr (auto_select_device)
+      cudaSetDevice(get_device_id());
     T *data;
     cudaError_t error = cudaMalloc(&data, n * sizeof(T));
     if (error != cudaSuccess) {
@@ -87,25 +89,25 @@ template <class T> struct cuda_device_allocator {
     }
   }
 };
-template <class T, class U>
-constexpr bool operator==(cuda_device_allocator<T> const &,
-                          cuda_device_allocator<U> const &) noexcept {
+template <class T, class U, bool auto_select_T, bool auto_select_U>
+constexpr bool operator==(cuda_device_allocator<T, auto_select_T> const &,
+                          cuda_device_allocator<U, auto_select_U> const &) noexcept {
   return true;
 }
-template <class T, class U>
-constexpr bool operator!=(cuda_device_allocator<T> const &,
-                          cuda_device_allocator<U> const &) noexcept {
+template <class T, class U, bool auto_select_T, bool auto_select_U>
+constexpr bool operator!=(cuda_device_allocator<T, auto_select_T> const &,
+                          cuda_device_allocator<U, auto_select_U> const &) noexcept {
   return false;
 }
 
 } // end namespace detail
 
-template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
+template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 using recycle_allocator_cuda_host =
-    detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T>>;
-template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
+    detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T, auto_select_device>>;
+template <typename T, bool auto_select_device = true,  std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 using recycle_allocator_cuda_device =
-    detail::recycle_allocator<T, detail::cuda_device_allocator<T>>;
+    detail::recycle_allocator<T, detail::cuda_device_allocator<T, auto_select_device>>;
 
 template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 struct cuda_device_buffer {

From 5a44383f57f4bf9c20e983d08e27319e3d81cc3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 16:40:05 -0500
Subject: [PATCH 06/42] Remove obsolete multigpu stream pools

---
 include/stream_manager.hpp | 137 ++++++++-----------------------------
 tests/stream_test.cpp      |  93 +------------------------
 2 files changed, 30 insertions(+), 200 deletions(-)
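
Hypothetical usage sketch of the two init flavours added below; it assumes
stream_pool exposes matching public forwarders (not shown in this excerpt) and
uses a CUDA executor with illustrative constructor arguments:

    using executor_t = hpx::cuda::experimental::cuda_executor;
    using pool_t = round_robin_pool<executor_t>;
    // Per-GPU arguments, e.g. an executor constructed with its device id.
    // Pools are appended in call order, so start with GPU 0:
    for (size_t gpu = 0; gpu < max_number_gpus; gpu++)
      stream_pool::init_on_gpu<executor_t, pool_t>(gpu, 8, gpu, true);
    // init_on_all_gpus instead reuses one argument list for every GPU and
    // relies on the registered device selector to bind each pool to its GPU.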

diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index 87740d2d..ec1eeeda 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -107,114 +107,6 @@ template <class Interface> class priority_pool {
   }
 };
 
-template <class Interface, class Pool> class multi_gpu_round_robin_pool {
-private:
-  using gpu_entry = std::tuple<Pool, size_t>; // interface, ref counter
-  std::deque<gpu_entry> pool{};
-  size_t current_interface{0};
-  size_t streams_per_gpu{0};
-
-public:
-  template <typename... Ts>
-  multi_gpu_round_robin_pool(size_t number_of_streams, int number_of_gpus,
-                             Ts... executor_args)
-      : streams_per_gpu{number_of_streams} {
-    for (auto gpu_id = 0; gpu_id < number_of_gpus; gpu_id++) {
-      pool.push_back(std::make_tuple(
-          Pool(number_of_streams, gpu_id, executor_args...),
-          0));
-    }
-  }
-
-  // return a tuple with the interface and its index (to release it later)
-  std::tuple<Interface &, size_t> get_interface() {
-    size_t last_interface = current_interface;
-    current_interface = (current_interface + 1) % pool.size();
-    std::get<1>(pool[last_interface])++;
-    size_t gpu_offset = last_interface * streams_per_gpu;
-    std::tuple<Interface &, size_t> stream_entry =
-        std::get<0>(pool[last_interface]).get_interface();
-    std::get<1>(stream_entry) += gpu_offset;
-    return stream_entry;
-  }
-  void release_interface(size_t index) {
-    size_t gpu_index = index / streams_per_gpu;
-    size_t stream_index = index % streams_per_gpu;
-    std::get<1>(pool[gpu_index])--;
-    std::get<0>(pool[gpu_index]).release_interface(stream_index);
-  }
-  bool interface_available(size_t load_limit) {
-    auto &current_min_gpu = std::get<0>(*(std::min_element(
-        std::begin(pool), std::end(pool),
-        [](const gpu_entry &first, const gpu_entry &second) -> bool {
-          return std::get<1>(first) < std::get<1>(second);
-        })));
-    return current_min_gpu.interface_available(load_limit);
-  }
-  size_t get_current_load() {
-    auto &current_min_gpu = std::get<0>(*(std::min_element(
-        std::begin(pool), std::end(pool),
-        [](const gpu_entry &first, const gpu_entry &second) -> bool {
-          return std::get<1>(first) < std::get<1>(second);
-        })));
-    return current_min_gpu.get_current_load();
-  }
-  size_t get_next_device_id() { return current_interface; }
-};
-
-template <class Interface, class Pool> class priority_pool_multi_gpu {
-private:
-  std::vector<size_t> priorities{};
-  std::vector<size_t> ref_counters{};
-  std::deque<Pool> gpu_interfaces{};
-  size_t streams_per_gpu{0};
-
-public:
-  template <typename... Ts>
-  priority_pool_multi_gpu(size_t number_of_streams, int number_of_gpus,
-                          Ts... executor_args)
-      : streams_per_gpu(number_of_streams) {
-    ref_counters.reserve(number_of_gpus);
-    priorities.reserve(number_of_gpus);
-    for (auto gpu_id = 0; gpu_id < number_of_gpus; gpu_id++) {
-      priorities.emplace_back(gpu_id);
-      ref_counters.emplace_back(0);
-      gpu_interfaces.emplace_back(streams_per_gpu, gpu_id,
-                                  executor_args...);
-    }
-  }
-  // return a tuple with the interface and its index (to release it later)
-  std::tuple<Interface &, size_t> get_interface() {
-    auto gpu = priorities[0];
-    ref_counters[gpu]++;
-    std::make_heap(std::begin(priorities), std::end(priorities),
-                   [this](const size_t &first, const size_t &second) -> bool {
-                     return ref_counters[first] > ref_counters[second];
-                   });
-    size_t gpu_offset = gpu * streams_per_gpu;
-    auto stream_entry = gpu_interfaces[gpu].get_interface();
-    std::get<1>(stream_entry) += gpu_offset;
-    return stream_entry;
-  }
-  void release_interface(size_t index) {
-    size_t gpu_index = index / streams_per_gpu;
-    size_t stream_index = index % streams_per_gpu;
-    ref_counters[gpu_index]--;
-    std::make_heap(std::begin(priorities), std::end(priorities),
-                   [this](const size_t &first, const size_t &second) -> bool {
-                     return ref_counters[first] > ref_counters[second];
-                   });
-    gpu_interfaces[gpu_index].release_interface(stream_index);
-  }
-  bool interface_available(size_t load_limit) {
-    return gpu_interfaces[priorities[0]].interface_available(load_limit);
-  }
-  size_t get_current_load() {
-    return gpu_interfaces[priorities[0]].get_current_load();
-  }
-  size_t get_next_device_id() { return priorities[0]; }
-};
-
 /// Access/Concurrency Control for stream pool implementation
 class stream_pool {
 public:
@@ -262,8 +154,18 @@ class stream_pool {
 private:
   template <class Interface, class Pool> class stream_pool_implementation {
   public:
+    /// Deprecated! Use init_on_all_gpus or init_on_gpu
     template <typename... Ts>
     static void init(size_t number_of_streams, Ts ... executor_args) {
+      static_assert(max_number_gpus == 1, "deprecated stream_pool::init does not support multigpu");
+      auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      instance().streampools.emplace_back(number_of_streams,
+                                          executor_args...);
+    }
+
+    /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments
+    template <typename... Ts>
+    static void init_on_all_gpus(size_t number_of_streams, Ts ... executor_args) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       if (number_of_streams > 0) {
         for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) {
@@ -274,36 +176,55 @@ class stream_pool {
       }
     }
 
+    /// Per-GPU init allowing for different init parameters depending on the GPU
+    /// (useful for executors that expect a GPU id during construction)
+    template <typename... Ts>
+    static void init_on_gpu(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) {
+      auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      if (number_of_streams > 0) {
+        instance().select_gpu_function(gpu_id);
+        instance().streampools.emplace_back(number_of_streams, 
+                                            executor_args...);
+      }
+    }
+
     // TODO add/rename into finalize?
     static void cleanup() {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      assert(instance().streampools.size() == max_number_gpus);
       instance().streampools.clear();
     }
 
     static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) {
       std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == max_number_gpus);
       return instance().streampools[gpu_id].get_interface();
     }
     static void release_interface(size_t index, const size_t gpu_id = 0) {
       std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == max_number_gpus);
       instance().streampools[gpu_id].release_interface(index);
     }
     static bool interface_available(size_t load_limit, const size_t gpu_id = 0) {
       std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == max_number_gpus);
       return instance().streampools[gpu_id].interface_available(load_limit);
     }
     static size_t get_current_load(const size_t gpu_id = 0) {
       std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == max_number_gpus);
       return instance().streampools[gpu_id].get_current_load();
     }
     // TODO deprecated! Remove...
     static size_t get_next_device_id(const size_t gpu_id = 0) {
       std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == max_number_gpus);
       return instance().streampools[gpu_id].get_next_device_id();
     }
 
     static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
+      assert(instance().streampools.size() == max_number_gpus);
       instance().select_gpu_function = select_gpu_function;
     }
 
diff --git a/tests/stream_test.cpp b/tests/stream_test.cpp
index e7010d0f..96599759 100644
--- a/tests/stream_test.cpp
+++ b/tests/stream_test.cpp
@@ -31,30 +31,9 @@ int main(int argc, char *argv[]) {
   test_pool_ref_counting<
       hpx::cuda::experimental::cuda_executor,
       round_robin_pool<hpx::cuda::experimental::cuda_executor>>(2, 0, false);
-  test_pool_ref_counting<
-      hpx::cuda::experimental::cuda_executor,
-      multi_gpu_round_robin_pool<
-          hpx::cuda::experimental::cuda_executor,
-          round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                     false);
-  test_pool_ref_counting<
-      hpx::cuda::experimental::cuda_executor,
-      priority_pool_multi_gpu<
-          hpx::cuda::experimental::cuda_executor,
-          priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false);
-  test_pool_ref_counting<
-      hpx::cuda::experimental::cuda_executor,
-      multi_gpu_round_robin_pool<
-          hpx::cuda::experimental::cuda_executor,
-          priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false);
-  test_pool_ref_counting<
-      hpx::cuda::experimental::cuda_executor,
-      priority_pool_multi_gpu<
-          hpx::cuda::experimental::cuda_executor,
-          round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                     false);
   std::cout << "Finished ref counting tests!" << std::endl;
 
+
   std::cout << "Starting wrapper objects tests ..." << std::endl;
   test_pool_wrappers<hpx::cuda::experimental::cuda_executor,
                      priority_pool<hpx::cuda::experimental::cuda_executor>>(
@@ -62,62 +41,16 @@ int main(int argc, char *argv[]) {
   test_pool_wrappers<hpx::cuda::experimental::cuda_executor,
                      round_robin_pool<hpx::cuda::experimental::cuda_executor>>(
       2, 0, false);
-  test_pool_wrappers<
-      hpx::cuda::experimental::cuda_executor,
-      multi_gpu_round_robin_pool<
-          hpx::cuda::experimental::cuda_executor,
-          round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                     false);
-  test_pool_wrappers<
-      hpx::cuda::experimental::cuda_executor,
-      priority_pool_multi_gpu<
-          hpx::cuda::experimental::cuda_executor,
-          priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false);
-
-  test_pool_wrappers<
-      hpx::cuda::experimental::cuda_executor,
-      multi_gpu_round_robin_pool<
-          hpx::cuda::experimental::cuda_executor,
-          priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, false);
-  test_pool_wrappers<
-      hpx::cuda::experimental::cuda_executor,
-      priority_pool_multi_gpu<
-          hpx::cuda::experimental::cuda_executor,
-          round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                     false);
   std::cout << "Finished wrapper objects tests!" << std::endl;
 
   std::cout << "Starting memcpy tests... " << std::endl;
   test_pool_memcpy<hpx::cuda::experimental::cuda_executor,
                    round_robin_pool<hpx::cuda::experimental::cuda_executor>>(
       2, 0, false);
-  test_pool_memcpy<
-      hpx::cuda::experimental::cuda_executor,
-      multi_gpu_round_robin_pool<
-          hpx::cuda::experimental::cuda_executor,
-          round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                     false);
   test_pool_memcpy<hpx::cuda::experimental::cuda_executor,
                    priority_pool<hpx::cuda::experimental::cuda_executor>>(
       2, 0, false);
-  test_pool_memcpy<hpx::cuda::experimental::cuda_executor,
-                   priority_pool_multi_gpu<
-                       hpx::cuda::experimental::cuda_executor,
-                       priority_pool<hpx::cuda::experimental::cuda_executor>>>(
-      2, 1, false);
 
-  // combo pool
-  test_pool_memcpy<hpx::cuda::experimental::cuda_executor,
-                   multi_gpu_round_robin_pool<
-                       hpx::cuda::experimental::cuda_executor,
-                       priority_pool<hpx::cuda::experimental::cuda_executor>>>(
-      2, 1, false);
-  test_pool_memcpy<
-      hpx::cuda::experimental::cuda_executor,
-      priority_pool_multi_gpu<
-          hpx::cuda::experimental::cuda_executor,
-          round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                     false);
   std::cout << "Finished memcpy tests! " << std::endl;
 
   std::cout << "Starting memcpy polling tests... " << std::endl;
@@ -127,33 +60,9 @@ int main(int argc, char *argv[]) {
     test_pool_memcpy<hpx::cuda::experimental::cuda_executor,
                      round_robin_pool<hpx::cuda::experimental::cuda_executor>>(
         2, 0, true);
-    test_pool_memcpy<
-        hpx::cuda::experimental::cuda_executor,
-        multi_gpu_round_robin_pool<
-            hpx::cuda::experimental::cuda_executor,
-            round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                       true);
     test_pool_memcpy<hpx::cuda::experimental::cuda_executor,
                      priority_pool<hpx::cuda::experimental::cuda_executor>>(
         2, 0, true);
-    test_pool_memcpy<
-        hpx::cuda::experimental::cuda_executor,
-        priority_pool_multi_gpu<
-            hpx::cuda::experimental::cuda_executor,
-            priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, true);
-
-    // combo pool
-    test_pool_memcpy<
-        hpx::cuda::experimental::cuda_executor,
-        multi_gpu_round_robin_pool<
-            hpx::cuda::experimental::cuda_executor,
-            priority_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1, true);
-    test_pool_memcpy<
-        hpx::cuda::experimental::cuda_executor,
-        priority_pool_multi_gpu<
-            hpx::cuda::experimental::cuda_executor,
-            round_robin_pool<hpx::cuda::experimental::cuda_executor>>>(2, 1,
-                                                                       true);
     hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0));
   }
   recycler::force_cleanup();

From 093dce2c2d18c752868c4d39488167bdff2f0dee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 16:40:22 -0500
Subject: [PATCH 07/42] Add select device functor

---
 include/aligned_buffer_util.hpp |  8 ++++++++
 include/buffer_manager.hpp      | 18 +++++++++++++++++-
 include/cuda_buffer_util.hpp    | 15 +++++++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp
index 456420bc..d36a994a 100644
--- a/include/aligned_buffer_util.hpp
+++ b/include/aligned_buffer_util.hpp
@@ -10,6 +10,14 @@
 #include <boost/align/aligned_allocator.hpp>
 
 namespace recycler {
+namespace device_selection {
+template <typename T, size_t alignement>
+struct select_device_functor<
+    T, boost::alignment::aligned_allocator<T, alignement>> {
+  void operator()(const size_t device_id) {}
+};
+} // namespace device_selection
+
 template <typename T, std::size_t alignement,
           std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 using recycle_aligned = detail::recycle_allocator<
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 8907a0e5..31a837ed 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -42,10 +42,25 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR
 #include "../include/detail/config.hpp"
 
 namespace recycler {
+
+namespace device_selection {
+template <typename T, typename Allocator> struct select_device_functor {
+  void operator()(const size_t device_id) {
+    if constexpr (max_number_gpus > 1)
+      throw std::runtime_error(
+          "Allocators used in Multi-GPU builds need explicit Multi-GPU support "
+          "(by having a select_device_functor overload");
+  }
+};
+template <typename T> struct select_device_functor<T, std::allocator<T>> {
+  void operator()(const size_t device_id) {}
+};
+} // namespace device_selection
+
 namespace detail {
 
+
 class buffer_recycler {
-  // Public interface
 public:
 #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING)
 
@@ -239,6 +254,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
       // No unused buffer found -> Create new one and return it
       try {
+        recycler::device_selection::select_device_functor<T, Host_Allocator>{}(location_id / number_instances); 
         Host_Allocator alloc;
         T *buffer = alloc.allocate(number_of_elements);
         instance()[location_id].buffer_map.insert(
diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index bbe01349..587eea97 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -17,6 +17,8 @@ namespace recycler {
 
 namespace detail {
 
+
+
 template <class T, bool auto_select_device = true> struct cuda_pinned_allocator {
   using value_type = T;
   cuda_pinned_allocator() noexcept = default;
@@ -48,6 +50,7 @@ template <class T, bool auto_select_device = true> struct cuda_pinned_allocator
     }
   }
 };
+
 template <class T, class U>
 constexpr bool operator==(cuda_pinned_allocator<T> const &,
                           cuda_pinned_allocator<U> const &) noexcept {
@@ -100,6 +103,7 @@ constexpr bool operator!=(cuda_device_allocator<T, auto_select_T> const &,
   return false;
 }
 
+
 } // end namespace detail
 
 template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
@@ -172,5 +176,16 @@ struct cuda_aggregated_device_buffer {
   Host_Allocator &alloc;
 };
 
+namespace device_selection {
+template <typename T>
+struct select_device_functor<T, detail::cuda_pinned_allocator<T>> {
+  void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); }
+};
+template <typename T>
+struct select_device_functor<T, detail::cuda_device_allocator<T>> {
+  void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); }
+};
+} // namespace device_selection
+
 } // end namespace recycler
 #endif

From cd93da1beb9116288a0d0c0c6929c74ce99003ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 16:43:19 -0500
Subject: [PATCH 08/42] Remove previous attempt at device selector

---
 include/cuda_buffer_util.hpp | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index 587eea97..431876f2 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -19,14 +19,12 @@ namespace detail {
 
 
 
-template <class T, bool auto_select_device = true> struct cuda_pinned_allocator {
+template <class T> struct cuda_pinned_allocator {
   using value_type = T;
   cuda_pinned_allocator() noexcept = default;
   template <class U>
   explicit cuda_pinned_allocator(cuda_pinned_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
-    if constexpr (auto_select_device)
-      cudaSetDevice(get_device_id());
     T *data;
     cudaError_t error =
         cudaMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T));
@@ -62,14 +60,12 @@ constexpr bool operator!=(cuda_pinned_allocator<T> const &,
   return false;
 }
 
-template <class T, bool auto_select_device = true> struct cuda_device_allocator {
+template <class T> struct cuda_device_allocator {
   using value_type = T;
   cuda_device_allocator() noexcept = default;
   template <class U>
   explicit cuda_device_allocator(cuda_device_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
-    if constexpr (auto_select_device)
-      cudaSetDevice(get_device_id());
     T *data;
     cudaError_t error = cudaMalloc(&data, n * sizeof(T));
     if (error != cudaSuccess) {
@@ -92,26 +88,26 @@ template <class T, bool auto_select_device = true> struct cuda_device_allocator
     }
   }
 };
-template <class T, class U, bool auto_select_T, bool auto_select_U>
-constexpr bool operator==(cuda_device_allocator<T, auto_select_T> const &,
-                          cuda_device_allocator<U, auto_select_U> const &) noexcept {
+template <class T, class U>
+constexpr bool operator==(cuda_device_allocator<T> const &,
+                          cuda_device_allocator<U> const &) noexcept {
   return true;
 }
-template <class T, class U, bool auto_select_T, bool auto_select_U>
-constexpr bool operator!=(cuda_device_allocator<T, auto_select_T> const &,
-                          cuda_device_allocator<U, auto_select_U> const &) noexcept {
+template <class T, class U>
+constexpr bool operator!=(cuda_device_allocator<T> const &,
+                          cuda_device_allocator<U> const &) noexcept {
   return false;
 }
 
 
 } // end namespace detail
 
-template <typename T, bool auto_select_device = true, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
+template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 using recycle_allocator_cuda_host =
-    detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T, auto_select_device>>;
-template <typename T, bool auto_select_device = true,  std::enable_if_t<std::is_trivial<T>::value, int> = 0>
+    detail::aggressive_recycle_allocator<T, detail::cuda_pinned_allocator<T>>;
+template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 using recycle_allocator_cuda_device =
-    detail::recycle_allocator<T, detail::cuda_device_allocator<T, auto_select_device>>;
+    detail::recycle_allocator<T, detail::cuda_device_allocator<T>>;
 
 template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 struct cuda_device_buffer {

From 13847d983458a5db271dc741cecc21bc420ef3ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 12 Jun 2023 17:51:57 -0500
Subject: [PATCH 09/42] Sync

---
 include/buffer_manager.hpp     | 26 ++++++++++++++++++++++++++
 include/kokkos_buffer_util.hpp | 29 +++++++++++++++++++++++++----
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 31a837ed..ad42698a 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -515,6 +515,12 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
     T *data = buffer_recycler::get<T, Host_Allocator>(n);
     return data;
   }
+  T *allocate(std::size_t n, std::size_t location_id) {
+    assert(location_id == 0);
+    T *data = buffer_recycler::get<T, Host_Allocator>(
+        n, true, location_id); // also initializes the buffer if it isn't reused
+    return data;
+  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n);
   }
@@ -531,6 +537,13 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
         n, false, hpx::get_worker_thread_num());
     return data;
   }
+  T *allocate(std::size_t n, std::size_t location_id) {
+    assert(location_id >= 0 && location_id < number_instances);
+    T *data = buffer_recycler::get<T, Host_Allocator>(
+        n, true, location_id); // also initializes the buffer
+                                                // if it isn't reused
+    return data;
+  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint);
   }
@@ -580,6 +593,12 @@ struct aggressive_recycle_allocator {
         n, true); // also initializes the buffer if it isn't reused
     return data;
   }
+  T *allocate(std::size_t n, std::size_t location_id) {
+    assert(location_id == 0);
+    T *data = buffer_recycler::get<T, Host_Allocator>(
+        n, true, location_id); // also initializes the buffer if it isn't reused
+    return data;
+  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n);
   }
@@ -597,6 +616,13 @@ struct aggressive_recycle_allocator {
                                                 // if it isn't reused
     return data;
   }
+  T *allocate(std::size_t n, std::size_t location_id) {
+    assert(location_id >= 0 && location_id < number_instances);
+    T *data = buffer_recycler::get<T, Host_Allocator>(
+        n, true, location_id); // also initializes the buffer
+                                                // if it isn't reused
+    return data;
+  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint);
   }
diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp
index f78f6bbb..3d8646a8 100644
--- a/include/kokkos_buffer_util.hpp
+++ b/include/kokkos_buffer_util.hpp
@@ -80,7 +80,7 @@ class aggregated_recycled_view : public kokkos_type {
 template <typename kokkos_type, typename alloc_type, typename element_type>
 class recycled_view : public kokkos_type {
 private:
-  static alloc_type allocator;
+  alloc_type allocator;
   size_t total_elements{0};
   std::shared_ptr<element_type> data_ref_counter;
 
@@ -97,6 +97,28 @@ class recycled_view : public kokkos_type {
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
                                            allocator, total_elements)) {}
 
+  template <class... Args>
+  explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args)
+      : allocator(alloc), kokkos_type(
+            allocator.allocate(kokkos_type::required_allocation_size(args...) /
+                               sizeof(element_type), location_id),
+            args...),
+        total_elements(kokkos_type::required_allocation_size(args...) /
+                       sizeof(element_type)),
+        data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
+                                           allocator, total_elements)) {}
+
+  template <bool use_custom_location_id, class... Args>
+  explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args)
+      : allocator(alloc), kokkos_type(
+            allocator.allocate(kokkos_type::required_allocation_size(args...) /
+                               sizeof(element_type), location_id),
+            args...),
+        total_elements(kokkos_type::required_allocation_size(args...) /
+                       sizeof(element_type)),
+        data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
+                                           allocator, total_elements)) {}
+
   recycled_view(
       const recycled_view<kokkos_type, alloc_type, element_type> &other)
       : kokkos_type(other) {
@@ -110,7 +132,6 @@ class recycled_view : public kokkos_type {
     data_ref_counter = other.data_ref_counter;
     kokkos_type::operator=(other);
     total_elements = other.total_elements;
-    allocator.increase_usage_counter(other.data(), other.total_elements);
     return *this;
   }
 
@@ -132,8 +153,8 @@ class recycled_view : public kokkos_type {
   ~recycled_view() {  }
 };
 
-template <class kokkos_type, class alloc_type, class element_type>
-alloc_type recycled_view<kokkos_type, alloc_type, element_type>::allocator;
+/* template <class kokkos_type, class alloc_type, class element_type> */
+/* alloc_type recycled_view<kokkos_type, alloc_type, element_type>::allocator; */
 
 } // end namespace recycler
 

From 7b9e4a30cb949ec2cf9d27f5c921bd5cc0d0fe8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 13 Jun 2023 16:16:32 +0200
Subject: [PATCH 10/42] Add location specific view constructors

---
 include/buffer_manager.hpp      | 30 +++--------------------
 include/kokkos_buffer_util.hpp  | 43 +++++++++++++++++++++------------
 tests/allocator_kokkos_test.cpp |  1 +
 3 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index ad42698a..5325bffb 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -501,6 +501,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
 template <typename T, typename Host_Allocator> struct recycle_allocator {
   using value_type = T;
+  using underlying_allocator_type = Host_Allocator;
+  static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
   const std::optional<size_t> dealloc_hint;
 
 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
@@ -515,12 +517,6 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
     T *data = buffer_recycler::get<T, Host_Allocator>(n);
     return data;
   }
-  T *allocate(std::size_t n, std::size_t location_id) {
-    assert(location_id == 0);
-    T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, true, location_id); // also initializes the buffer if it isn't reused
-    return data;
-  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n);
   }
@@ -537,13 +533,6 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
         n, false, hpx::get_worker_thread_num());
     return data;
   }
-  T *allocate(std::size_t n, std::size_t location_id) {
-    assert(location_id >= 0 && location_id < number_instances);
-    T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, true, location_id); // also initializes the buffer
-                                                // if it isn't reused
-    return data;
-  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint);
   }
@@ -578,6 +567,8 @@ operator!=(recycle_allocator<T, Host_Allocator> const &,
 template <typename T, typename Host_Allocator>
 struct aggressive_recycle_allocator {
   using value_type = T;
+  using underlying_allocator_type = Host_Allocator;
+  static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
   std::optional<size_t> dealloc_hint;
 
 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
@@ -593,12 +584,6 @@ struct aggressive_recycle_allocator {
         n, true); // also initializes the buffer if it isn't reused
     return data;
   }
-  T *allocate(std::size_t n, std::size_t location_id) {
-    assert(location_id == 0);
-    T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, true, location_id); // also initializes the buffer if it isn't reused
-    return data;
-  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n);
   }
@@ -616,13 +601,6 @@ struct aggressive_recycle_allocator {
                                                 // if it isn't reused
     return data;
   }
-  T *allocate(std::size_t n, std::size_t location_id) {
-    assert(location_id >= 0 && location_id < number_instances);
-    T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, true, location_id); // also initializes the buffer
-                                                // if it isn't reused
-    return data;
-  }
   void deallocate(T *p, std::size_t n) {
     buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint);
   }
diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp
index 3d8646a8..52413cce 100644
--- a/include/kokkos_buffer_util.hpp
+++ b/include/kokkos_buffer_util.hpp
@@ -7,6 +7,8 @@
 #define KOKKOS_BUFFER_UTIL_HPP
 #include <Kokkos_Core.hpp>
 #include <memory>
+#include <buffer_manager.hpp>
+#include <type_traits>
 
 namespace recycler {
 
@@ -86,8 +88,10 @@ class recycled_view : public kokkos_type {
 
 public:
   using view_type = kokkos_type;
-  template <class... Args>
-  explicit recycled_view(Args... args)
+  static_assert(std::is_same_v<element_type, typename alloc_type::value_type>);
+  template <typename... Args,
+            std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true>
+  recycled_view(Args... args)
       : kokkos_type(
             allocator.allocate(kokkos_type::required_allocation_size(args...) /
                                sizeof(element_type)),
@@ -97,24 +101,33 @@ class recycled_view : public kokkos_type {
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
                                            allocator, total_elements)) {}
 
-  template <class... Args>
-  explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args)
-      : allocator(alloc), kokkos_type(
-            allocator.allocate(kokkos_type::required_allocation_size(args...) /
-                               sizeof(element_type), location_id),
+  template <typename... Args,
+            std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true>
+  recycled_view(std::size_t location_id, Args... args)
+      : kokkos_type(
+            detail::buffer_recycler::get<
+                element_type, typename alloc_type::underlying_allocator_type>(
+                kokkos_type::required_allocation_size(args...) /
+                    sizeof(element_type),
+                false, location_id),
             args...),
         total_elements(kokkos_type::required_allocation_size(args...) /
                        sizeof(element_type)),
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
                                            allocator, total_elements)) {}
 
-  template <bool use_custom_location_id, class... Args>
-  explicit recycled_view(std::size_t location_id, alloc_type alloc, Args... args)
-      : allocator(alloc), kokkos_type(
-            allocator.allocate(kokkos_type::required_allocation_size(args...) /
-                               sizeof(element_type), location_id),
-            args...),
-        total_elements(kokkos_type::required_allocation_size(args...) /
+  template <
+      typename layout_t,
+      std::enable_if_t<Kokkos::is_array_layout<layout_t>::value, bool> = true>
+  recycled_view(std::size_t location_id, layout_t layout)
+      : kokkos_type(
+            detail::buffer_recycler::get<
+                element_type, typename alloc_type::underlying_allocator_type>(
+                kokkos_type::required_allocation_size(layout) /
+                    sizeof(element_type),
+                false, location_id),
+            layout),
+        total_elements(kokkos_type::required_allocation_size(layout) /
                        sizeof(element_type)),
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
                                            allocator, total_elements)) {}
@@ -153,8 +166,6 @@ class recycled_view : public kokkos_type {
   ~recycled_view() {  }
 };
 
-/* template <class kokkos_type, class alloc_type, class element_type> */
-/* alloc_type recycled_view<kokkos_type, alloc_type, element_type>::allocator; */
 
 } // end namespace recycler
 
diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp
index de808859..e0c7fb55 100644
--- a/tests/allocator_kokkos_test.cpp
+++ b/tests/allocator_kokkos_test.cpp
@@ -81,6 +81,7 @@ int main(int argc, char *argv[]) {
   for (size_t pass = 0; pass < passes; pass++) {
     test_view my_wrapper_test1(1000);
     test_view my_wrapper_test2(1000);
+    test_view my_wrapper_test3(127, 1000); // test 1D with location id 
     double t = 2.6;
     Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000),
                         KOKKOS_LAMBDA(const int n) {

From d8a887108e403a47ad8bf2e9f51599674125d390 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 13 Jun 2023 12:20:59 -0500
Subject: [PATCH 11/42] Fix cmake and interface change

---
 CMakeLists.txt             | 14 ++++++++++----
 include/stream_manager.hpp |  4 ++--
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5801e499..5b6bd617 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,6 +170,9 @@ if (CPPUDDLE_WITH_HPX)
     target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
     target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
   endif()
+else()
+  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
+  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
 endif()
 if (CPPUDDLE_WITH_COUNTERS)
   target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS")
@@ -184,12 +187,15 @@ if (CPPUDDLE_WITH_HPX)
   target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX")
   if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
     target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS")
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}")
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}")
   else()
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
   endif()
+else()
+  target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
+  target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
 endif()
 if (CPPUDDLE_WITH_COUNTERS)
   target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS")
diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index ec1eeeda..22997e62 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -165,7 +165,7 @@ class stream_pool {
 
     /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments
     template <typename... Ts>
-    static void init_on_all_gpus(size_t number_of_streams, Ts ... executor_args) {
+    static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       if (number_of_streams > 0) {
         for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) {
@@ -179,7 +179,7 @@ class stream_pool {
     /// Per-GPU init allowing for different init parameters depending on the GPU
     /// (useful for executors that expect a GPU id during construction)
     template <typename... Ts>
-    static void init_on_gpu(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) {
+    static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       if (number_of_streams > 0) {
         instance().select_gpu_function(gpu_id);

From 7f7e9eac65d077e0ad4308b0f97c667c1a92deb6 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 13:40:47 -0500
Subject: [PATCH 12/42] Fix multigpu build

---
 include/aggregation_manager.hpp | 32 +++++++++++++++-------------
 include/buffer_manager.hpp      |  5 ++++-
 include/cuda_buffer_util.hpp    |  4 ++--
 include/stream_manager.hpp      | 37 ++++++++++++++++++++-------------
 4 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 2682a55c..4bc4dbb7 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -389,6 +389,7 @@ template <typename Executor> class Aggregated_Executor {
   Executor &executor;
 
 public:
+  size_t gpu_id;
   // Subclasses
 
   /// Slice class - meant as a scope interface to the aggregated executor
@@ -559,13 +560,14 @@ template <typename Executor> class Aggregated_Executor {
 
         // Default location -- useful for GPU builds as we otherwise create way too
         // many different buffers for different aggregation sizes on different GPUs
-        size_t location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu;
+        size_t location_id = gpu_id * instances_per_gpu;
 #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
         if (max_slices == 1) {
           // get preferred location: aka the current hpx thread's location
           // Usually handy for CPU builds where we want to use the buffers
           // close to the current CPU core
-          location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu;
+          /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */
+          location_id = (gpu_id) * instances_per_gpu;
           // division makes sure that we always use the same instance to store our gpu buffers.
         }
 #endif
@@ -914,11 +916,11 @@ template <typename Executor> class Aggregated_Executor {
   }
 
   Aggregated_Executor(const size_t number_slices,
-                      Aggregated_Executor_Modes mode)
+                      Aggregated_Executor_Modes mode, const size_t gpu_id)
       : max_slices(number_slices), current_slices(0), slices_exhausted(false),dealloc_counter(0),
-        mode(mode), executor_slices_alive(false), buffers_in_use(false),
+        mode(mode), executor_slices_alive(false), buffers_in_use(false), gpu_id(gpu_id),
         executor_tuple(
-            stream_pool::get_interface<Executor, round_robin_pool<Executor>>()),
+            stream_pool::get_interface<Executor, round_robin_pool<Executor>>(gpu_id)),
         executor(std::get<0>(executor_tuple)),
         current_continuation(hpx::make_ready_future()),
         last_stream_launch_done(hpx::make_ready_future()) {}
@@ -1009,20 +1011,22 @@ class aggregation_pool {
   template <typename... Ts>
   static void init(size_t number_of_executors, size_t slices_per_executor,
                    Aggregated_Executor_Modes mode) {
-    const size_t gpu_id = get_device_id();
-    std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
-    assert(instance()[gpu_id].aggregation_executor_pool.empty());
-    for (int i = 0; i < number_of_executors; i++) {
-      instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor,
-                                                      mode);
+    for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) {
+      std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
+      assert(instance()[gpu_id].aggregation_executor_pool.empty());
+      for (int i = 0; i < number_of_executors; i++) {
+        instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor,
+                                                        mode, gpu_id);
+      }
+      instance()[gpu_id].slices_per_executor = slices_per_executor;
+      instance()[gpu_id].mode = mode;
     }
-    instance()[gpu_id].slices_per_executor = slices_per_executor;
-    instance()[gpu_id].mode = mode;
   }
 
   /// Will always return a valid executor slice
   static decltype(auto) request_executor_slice(void) {
     const size_t gpu_id = get_device_id();
+    /* const size_t gpu_id = 1; */
     std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
     assert(!instance()[gpu_id].aggregation_executor_pool.empty());
     std::optional<hpx::lcos::future<
@@ -1052,7 +1056,7 @@ class aggregation_pool {
     // return empty optional
     if (instance()[gpu_id].growing_pool) {
       instance()[gpu_id].aggregation_executor_pool.emplace_back(
-          instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode);
+          instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id);
       instance()[gpu_id].current_interface =
           instance()[gpu_id].aggregation_executor_pool.size() - 1;
       assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480);
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 5325bffb..61254efe 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -254,7 +254,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
       // No unused buffer found -> Create new one and return it
       try {
-        recycler::device_selection::select_device_functor<T, Host_Allocator>{}(location_id / number_instances); 
+        recycler::device_selection::select_device_functor<T, Host_Allocator>{}(
+            location_id / instances_per_gpu);
         Host_Allocator alloc;
         T *buffer = alloc.allocate(number_of_elements);
         instance()[location_id].buffer_map.insert(
@@ -278,6 +279,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         // If there still isn't enough memory left, the caller has to handle it
         // We've done all we can in here
         Host_Allocator alloc;
+        recycler::device_selection::select_device_functor<T, Host_Allocator>{}(
+            location_id / instances_per_gpu);
         T *buffer = alloc.allocate(number_of_elements);
         instance()[location_id].buffer_map.insert(
             {buffer, std::make_tuple(buffer, number_of_elements, 1,
diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index 431876f2..8fb57d4f 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -175,11 +175,11 @@ struct cuda_aggregated_device_buffer {
 namespace device_selection {
 template <typename T>
 struct select_device_functor<T, detail::cuda_pinned_allocator<T>> {
-  void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); }
+  void operator()(const size_t device_id) { cudaSetDevice(device_id); }
 };
 template <typename T>
 struct select_device_functor<T, detail::cuda_device_allocator<T>> {
-  void operator()(const size_t device_id) { cudaSetDevice(get_device_id()); }
+  void operator()(const size_t device_id) { cudaSetDevice(device_id); }
 };
 } // namespace device_selection
 
diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index 22997e62..38f2f31d 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -44,6 +44,7 @@ template <class Interface> class round_robin_pool {
   }
   // return a tuple with the interface and its index (to release it later)
   std::tuple<Interface &, size_t> get_interface() {
+    assert(!(pool.empty())); 
     size_t last_interface = current_interface;
     current_interface = (current_interface + 1) % pool.size();
     ref_counters[last_interface]++;
@@ -114,18 +115,28 @@ class stream_pool {
   static void init(size_t number_of_streams, Ts ... executor_args) {
     stream_pool_implementation<Interface, Pool>::init(number_of_streams,
                                                       executor_args...);
-}
+  }
+  template <class Interface, class Pool, typename... Ts>
+  static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) {
+    stream_pool_implementation<Interface, Pool>::init_all_executor_pools(number_of_streams,
+                                                      executor_args...);
+  }
+  template <class Interface, class Pool, typename... Ts>
+  static void init_executor_pool(size_t pool_id, size_t number_of_streams, Ts ... executor_args) {
+    stream_pool_implementation<Interface, Pool>::init_executor_pool(pool_id, number_of_streams,
+                                                      executor_args...);
+  }
   template <class Interface, class Pool> static void cleanup() {
     stream_pool_implementation<Interface, Pool>::cleanup();
   }
   template <class Interface, class Pool>
-  static std::tuple<Interface &, size_t> get_interface() {
-    return stream_pool_implementation<Interface, Pool>::get_interface(get_device_id());
+  static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = get_device_id()) {
+    return stream_pool_implementation<Interface, Pool>::get_interface(gpu_id);
   }
   template <class Interface, class Pool>
-  static void release_interface(size_t index) noexcept {
+  static void release_interface(size_t index, const size_t gpu_id = get_device_id()) noexcept {
     stream_pool_implementation<Interface, Pool>::release_interface(index,
-        get_device_id());
+        gpu_id);
   }
   template <class Interface, class Pool>
   static bool interface_available(size_t load_limit) noexcept {
@@ -144,8 +155,8 @@ class stream_pool {
   }
 
   template <class Interface, class Pool>
-  static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) {
-    return stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function);
+  static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
+    stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function);
   }
 
 private:
@@ -157,10 +168,10 @@ class stream_pool {
     /// Deprecated! Use init_on_all_gpu or init_on_gpu
     template <typename... Ts>
     static void init(size_t number_of_streams, Ts ... executor_args) {
-      static_assert(max_number_gpus == 1, "deprecated stream_pool::init does not support multigpu");
+      /* static_assert(sizeof...(Ts) == sizeof...(Ts) && max_number_gpus == 1, */
+      /*               "deprecated stream_pool::init does not support multigpu"); */
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
-          instance().streampools.emplace_back(number_of_streams,
-                                              executor_args...);
+      instance().streampools.emplace_back(number_of_streams, executor_args...);
     }
 
     /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments
@@ -222,10 +233,10 @@ class stream_pool {
       return instance().streampools[gpu_id].get_next_device_id();
     }
 
-    static size_t set_device_selector(std::function<void(size_t)> select_gpu_function) {
+    static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       assert(instance().streampools.size() == max_number_gpus);
-      return instance().select_gpu_function = select_gpu_function;
+      instance().select_gpu_function = select_gpu_function;
     }
 
   private:
@@ -290,8 +301,6 @@ template <class Interface, class Pool> class stream_interface {
     return interface.async_execute(std::forward<F>(f), std::forward<Ts>(ts)...);
   }
 
-  inline size_t get_gpu_id() noexcept { return interface.get_gpu_id(); }
-
   // allow implict conversion
   operator Interface &() { // NOLINT
     return interface;

From c39ee74ea6db2f220ecffc26b4aa9045cae2fee3 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 20 Jun 2023 11:29:02 -0500
Subject: [PATCH 13/42] Use spinlocks without backoff

---
 include/detail/config.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index 4c8eca37..f3717411 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -10,8 +10,8 @@
 //
 #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
 #include <hpx/mutex.hpp>
-using mutex_t = hpx::spinlock;
-using aggregation_mutex_t = hpx::spinlock;
+using mutex_t = hpx::spinlock_no_backoff;
+using aggregation_mutex_t = hpx::spinlock_no_backoff;
 #else
 #include <mutex>
 using mutex_t = std::mutex;

From 93f2d6060f6e00003de3b053c6004fef841b6e16 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Thu, 22 Jun 2023 22:04:25 -0500
Subject: [PATCH 14/42] Add select gpu function

---
 include/aggregation_manager.hpp | 1 +
 include/stream_manager.hpp      | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 4bc4dbb7..0269b527 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -816,6 +816,7 @@ template <typename Executor> class Aggregated_Executor {
           // Fallback launch condition: Launch as soon as the underlying stream
           // is ready
           /* auto slices_full_fut = slices_full_promise.get_future(); */
+          stream_pool::select_device<Executor, round_robin_pool<Executor>>(gpu_id);
           auto exec_fut = executor.get_future(); 
           /* fut = hpx::when_any(exec_fut, slices_full_fut); */
           fut = std::move(exec_fut);
diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index 38f2f31d..a10229a2 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -159,6 +159,11 @@ class stream_pool {
     stream_pool_implementation<Interface, Pool>::set_device_selector(select_gpu_function);
   }
 
+  template <class Interface, class Pool>
+  static void select_device(size_t gpu_id) {
+    stream_pool_implementation<Interface, Pool>::select_device(gpu_id);
+  }
+
 private:
   stream_pool() = default;
 
@@ -239,6 +244,10 @@ class stream_pool {
       instance().select_gpu_function = select_gpu_function;
     }
 
+    static void select_device(size_t gpu_id) {
+      instance().select_gpu_function(gpu_id);
+    }
+
   private:
     stream_pool_implementation() = default;
     mutex_t pool_mut{};

From 1dd0d5260799c805006949ce87f3dd2045d6db18 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 10:48:11 -0500
Subject: [PATCH 15/42] Use double checked locking for aggregation again

---
 include/aggregation_manager.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 0269b527..57d39f15 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -675,11 +675,11 @@ template <typename Executor> class Aggregated_Executor {
 
   /// Only meant to be accessed by the slice executors
   bool sync_aggregation_slices(const size_t slice_launch_counter) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      std::lock_guard<aggregation_mutex_t> guard(mut);
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, false, executor);
         overall_launch_counter = function_calls.size();
@@ -695,11 +695,11 @@ template <typename Executor> class Aggregated_Executor {
   /// Only meant to be accessed by the slice executors
   template <typename F, typename... Ts>
   void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      std::lock_guard<aggregation_mutex_t> guard(mut);
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, false, executor);
         overall_launch_counter = function_calls.size();
@@ -718,11 +718,11 @@ template <typename Executor> class Aggregated_Executor {
   template <typename F, typename... Ts>
   hpx::lcos::future<void> async(const size_t slice_launch_counter, F &&f,
                                 Ts &&...ts) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      std::lock_guard<aggregation_mutex_t> guard(mut);
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, true, executor);
         overall_launch_counter = function_calls.size();
@@ -738,11 +738,11 @@ template <typename Executor> class Aggregated_Executor {
   template <typename F, typename... Ts>
   hpx::lcos::shared_future<void> wrap_async(const size_t slice_launch_counter, F &&f,
                                 Ts &&...ts) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      std::lock_guard<aggregation_mutex_t> guard(mut);
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, true, executor);
         overall_launch_counter = function_calls.size();

From cdb6f5129d8c555a8ed5f8cf25dab24a8e55f942 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 10:48:27 -0500
Subject: [PATCH 16/42] Increase deadlock test iterations again

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5b6bd617..fb38b230 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,7 +33,7 @@ set(CPPUDDLE_WITH_MAX_NUMBER_WORKERS "128" CACHE STRING "Max number of workers t
 # Test-related options
 option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF)
 option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF)
-set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "10000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
+set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
 option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF)
 option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF)
 # Tooling options

From 33be329cfee44be18b45b0b268ef60104cb0233d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 14:34:37 -0500
Subject: [PATCH 17/42] Revert "Use double checked locking for aggregation
 again"

This reverts commit 1dd0d5260799c805006949ce87f3dd2045d6db18.

The double-checked locking likely caused trouble: the deque was accessed
outside of the mutex-protected area while another call was being added to it.
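
For reference, a minimal stand-alone sketch of the reverted pattern (a
hypothetical illustration only, not the actual Aggregated_Executor code;
the member names roughly follow the surrounding diff):

    #include <cstddef>
    #include <deque>
    #include <mutex>

    struct launch_tracker {
      std::mutex mut;
      std::deque<int> function_calls;
      std::size_t overall_launch_counter = 0;

      void add_call_double_checked(std::size_t slice_launch_counter) {
        if (overall_launch_counter <= slice_launch_counter) {   // unlocked first check
          std::lock_guard<std::mutex> guard(mut);
          if (overall_launch_counter <= slice_launch_counter) { // re-check under the lock
            function_calls.emplace_back(static_cast<int>(slice_launch_counter));
            overall_launch_counter = function_calls.size();
          }
        }
        // Any read of function_calls outside the guard can race with a concurrent
        // emplace_back from another slice executor, which is why this commit
        // restores the outer lock_guard.
      }
    };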
---
 include/aggregation_manager.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 57d39f15..0269b527 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -675,11 +675,11 @@ template <typename Executor> class Aggregated_Executor {
 
   /// Only meant to be accessed by the slice executors
   bool sync_aggregation_slices(const size_t slice_launch_counter) {
-    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+    std::lock_guard<aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      std::lock_guard<aggregation_mutex_t> guard(mut);
+      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, false, executor);
         overall_launch_counter = function_calls.size();
@@ -695,11 +695,11 @@ template <typename Executor> class Aggregated_Executor {
   /// Only meant to be accessed by the slice executors
   template <typename F, typename... Ts>
   void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) {
-    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+    std::lock_guard<aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      std::lock_guard<aggregation_mutex_t> guard(mut);
+      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, false, executor);
         overall_launch_counter = function_calls.size();
@@ -718,11 +718,11 @@ template <typename Executor> class Aggregated_Executor {
   template <typename F, typename... Ts>
   hpx::lcos::future<void> async(const size_t slice_launch_counter, F &&f,
                                 Ts &&...ts) {
-    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+    std::lock_guard<aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      std::lock_guard<aggregation_mutex_t> guard(mut);
+      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, true, executor);
         overall_launch_counter = function_calls.size();
@@ -738,11 +738,11 @@ template <typename Executor> class Aggregated_Executor {
   template <typename F, typename... Ts>
   hpx::lcos::shared_future<void> wrap_async(const size_t slice_launch_counter, F &&f,
                                 Ts &&...ts) {
-    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+    std::lock_guard<aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      std::lock_guard<aggregation_mutex_t> guard(mut);
+      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, true, executor);
         overall_launch_counter = function_calls.size();

From eb242180c534ffe23b1667c96fb93a37d6d62714 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 15:14:42 -0500
Subject: [PATCH 18/42] Add default id

---
 include/aggregation_manager.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 0269b527..e687e106 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -917,7 +917,7 @@ template <typename Executor> class Aggregated_Executor {
   }
 
   Aggregated_Executor(const size_t number_slices,
-                      Aggregated_Executor_Modes mode, const size_t gpu_id)
+                      Aggregated_Executor_Modes mode, const size_t gpu_id = 0)
       : max_slices(number_slices), current_slices(0), slices_exhausted(false),dealloc_counter(0),
         mode(mode), executor_slices_alive(false), buffers_in_use(false), gpu_id(gpu_id),
         executor_tuple(

From bc1edeee940ca40cada14f5143156536581ff6e1 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 13:01:24 -0500
Subject: [PATCH 19/42] Fix mutex type for aggregation

---
 include/detail/config.hpp             | 3 ++-
 tests/work_aggregation_cuda_triad.cpp | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index f3717411..0244ae9b 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -11,7 +11,7 @@
 #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
 #include <hpx/mutex.hpp>
 using mutex_t = hpx::spinlock_no_backoff;
-using aggregation_mutex_t = hpx::spinlock_no_backoff;
+using aggregation_mutex_t = hpx::mutex;
 #else
 #include <mutex>
 using mutex_t = std::mutex;
@@ -43,6 +43,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR
 // Thread and MultiGPU configuration
 //
 constexpr size_t number_instances = CPPUDDLE_MAX_NUMBER_WORKERS;
+static_assert(number_instances >= 1);
 constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS;
 #ifndef CPPUDDLE_HAVE_HPX
 static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu");
diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp
index 4bc050b1..2ded567a 100644
--- a/tests/work_aggregation_cuda_triad.cpp
+++ b/tests/work_aggregation_cuda_triad.cpp
@@ -28,6 +28,7 @@ __global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_
 //===============================================================================
 //===============================================================================
 int hpx_main(int argc, char *argv[]) {
+  static_assert(max_number_gpus == 1, "This test currently does not support MultiGPU builds!");
   // Init parameters
   size_t problem_size{0};
   size_t kernel_size{0};

From 7dcab62782e037121e86fadb8e1d3d2694e1e3e4 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 15:33:55 -0500
Subject: [PATCH 20/42] Fix kokkos test

---
 tests/allocator_kokkos_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp
index e0c7fb55..6109f71d 100644
--- a/tests/allocator_kokkos_test.cpp
+++ b/tests/allocator_kokkos_test.cpp
@@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
   for (size_t pass = 0; pass < passes; pass++) {
     test_view my_wrapper_test1(1000);
     test_view my_wrapper_test2(1000);
-    test_view my_wrapper_test3(127, 1000); // test 1D with location id 
+    test_view my_wrapper_test3(number_instances - 1, 1000); // test 1D with location id 
     double t = 2.6;
     Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000),
                         KOKKOS_LAMBDA(const int n) {

From db58db97cd5e74f5669e368a87bf5af574c9b0b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Mon, 3 Jul 2023 15:16:40 -0500
Subject: [PATCH 21/42] Check for invalid location ids

---
 include/buffer_manager.hpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 61254efe..1e434296 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -221,6 +221,9 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
       if (location_hint) {
         location_id = location_hint.value();
       }
+      if (location_id >= number_instances) {
+        throw std::runtime_error("Tried to create buffer with invalid location_id [get]");
+      }
       std::lock_guard<mutex_t> guard(instance()[location_id].mut);
 
 
@@ -305,6 +308,10 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
       if (location_hint) {
         size_t location_id = location_hint.value();
+        if (location_id >= number_instances) {
+          throw std::runtime_error(
+              "Buffer recylcer received invalid location hint [mark_unused]");
+        }
         std::lock_guard<mutex_t> guard(instance()[location_id].mut);
         if (instance()[location_id].buffer_map.find(memory_location) !=
             instance()[location_id].buffer_map.end()) {

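Note: a hedged caller-side sketch of the new checks (the float/std::allocator combination, the oversized hint, and the include are example choices, not prescribed usage): a location hint at or above number_instances is now rejected with std::runtime_error instead of indexing past the per-worker bucket array.

#include <iostream>
#include <memory>
#include <stdexcept>
// #include "buffer_manager.hpp"   // assumed CPPuddle include

void demo_invalid_location_hint() {
  try {
    // Hint far beyond number_instances: previously an out-of-bounds access,
    // now caught by the check added above.
    float *buf =
        recycler::detail::buffer_recycler::get<float, std::allocator<float>>(
            1024, false, /*location_hint=*/9999);
    (void)buf;
  } catch (std::runtime_error const &e) {
    std::cerr << "rejected: " << e.what() << '\n';
  }
}
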
From 073f4dad2edb1138b6c0ef8e6583d17a0b48ca22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Sat, 12 Aug 2023 15:55:22 -0500
Subject: [PATCH 22/42] Add hpx performance counters

---
 include/buffer_manager.hpp                    | 183 +++++++++++++++---
 tests/allocator_aligned_test.cpp              |   3 +
 tests/allocator_hpx_test.cpp                  |   2 +
 ...llocator_kokkos_executor_for_loop_test.cpp |   1 +
 tests/allocator_kokkos_test.cpp               |   1 +
 tests/allocator_test.cpp                      |   3 +
 tests/work_aggregation_test.cpp               |   1 +
 7 files changed, 167 insertions(+), 27 deletions(-)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 1e434296..f17cc725 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -37,6 +37,9 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR
 
 #ifdef CPPUDDLE_HAVE_COUNTERS
 #include <boost/core/demangle.hpp>
+#if defined(CPPUDDLE_HAVE_HPX)
+#include <hpx/include/performance_counters.hpp>
+#endif
 #endif
 
 #include "../include/detail/config.hpp"
@@ -96,6 +99,17 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements);
   }
 #endif
+  template <typename T, typename Host_Allocator>
+    static void register_allocator_counters_with_hpx(void) {
+#ifdef CPPUDDLE_HAVE_COUNTERS
+      buffer_manager<T, Host_Allocator>::register_counters_with_hpx();
+#else
+    std::cerr << "Warning: Trying to register allocator performance counters with HPX but CPPuddle was built "
+                 "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
+              << std::endl;
+#endif
+    }
+
   /// Deallocate all buffers, no matter whether they are marked as used or not
   static void clean_all() {
     std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
@@ -121,6 +135,20 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     }
   }
 
+  static void print_performance_counters() {
+#ifdef CPPUDDLE_HAVE_COUNTERS
+    std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
+    for (const auto &print_function :
+         instance().print_callbacks) {
+      print_function();
+    }
+#else
+    std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built "
+                 "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
+              << std::endl;
+#endif
+  }
+
   // Member variables and methods
 private:
 
@@ -129,6 +157,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     static buffer_recycler singleton{};
     return singleton;
   }
+  /// Callbacks for printing the performance counter data
+  std::list<std::function<void()>> print_callbacks;
   /// Callbacks for buffer_manager finalize - each callback completely destroys
   /// one buffer_manager
   std::list<std::function<void()>> finalize_callbacks;
@@ -160,6 +190,12 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
     instance().finalize_callbacks.push_back(func);
   }
+  /// Add a callback function that prints the performance counter data of one
+  /// buffer manager
+  static void add_print_callback(const std::function<void()> &func) {
+    std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
+    instance().print_callbacks.push_back(func);
+  }
 
 public:
   ~buffer_recycler() = default; 
@@ -174,6 +210,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     // well
     using buffer_entry_type = std::tuple<T *, size_t, size_t, bool>;
 
+
   public:
     /// Cleanup and delete this singleton
     static void clean() {
@@ -183,6 +220,13 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         instance()[i].clean_all_buffers();
       }
     }
+    static void print_performance_counters() {
+      assert(instance() && !is_finalized);
+      for (auto i = 0; i < number_instances; i++) {
+        std::lock_guard<mutex_t> guard(instance()[i].mut);
+        instance()[i].print_counters();
+      }
+    }
     static void finalize() {
       assert(instance() && !is_finalized);
       is_finalized = true;
@@ -207,6 +251,71 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         instance()[i].unused_buffer_list.clear();
       }
     }
+#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX)
+    static size_t get_sum_number_recycling(bool reset) {
+      if (reset)
+        sum_number_recycling = 0;
+      return sum_number_recycling;
+    }
+    static size_t get_sum_number_allocation(bool reset) {
+      if (reset)
+        sum_number_allocation = 0;
+      return sum_number_allocation;
+    }
+    static size_t get_sum_number_creation(bool reset) {
+      if (reset)
+        sum_number_creation = 0;
+      return sum_number_creation;
+    }
+    static size_t get_sum_number_deallocation(bool reset) {
+      if (reset)
+        sum_number_deallocation = 0;
+      return sum_number_deallocation;
+    }
+    static size_t get_sum_number_wrong_hints(bool reset) {
+      if (reset)
+        sum_number_wrong_hints = 0;
+      return sum_number_wrong_hints;
+    }
+    static size_t get_sum_number_bad_allocs(bool reset) {
+      if (reset)
+        sum_number_bad_allocs = 0;
+      return sum_number_bad_allocs;
+    }
+
+    static void register_counters_with_hpx(void) {
+      std::string alloc_name =
+          boost::core::demangle(typeid(Host_Allocator).name()) +
+          std::string("_") + boost::core::demangle(typeid(T).name());
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"),
+          &get_sum_number_recycling,
+          "Number of allocations using a recycled buffer with this "
+          "allocator");
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"),
+          &get_sum_number_allocation,
+          "Number of allocations with this allocator");
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"),
+          &get_sum_number_creation,
+          "Number of allocations not using a recycled buffer with this "
+          "allocator");
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"),
+          &get_sum_number_deallocation,
+          "Number of deallocations yielding buffers to be recycled with this "
+          "allocator");
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"),
+          &get_sum_number_wrong_hints,
+          "Number of wrong hints supplied to the dealloc method with this allocator");
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"),
+          &get_sum_number_bad_allocs,
+          "Number of wrong bad allocs which triggered a cleanup of unused buffers");
+    }
+#endif
 
     /// Tries to recycle or create a buffer of type T and size number_elements.
     static T *get(size_t number_of_elements, bool manage_content_lifetime,
@@ -229,6 +338,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
 #ifdef CPPUDDLE_HAVE_COUNTERS
       instance()[location_id].number_allocation++;
+      sum_number_allocation++;
 #endif
       // Check for unused buffers we can recycle:
       for (auto iter = instance()[location_id].unused_buffer_list.begin();
@@ -250,6 +360,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
           instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple});
 #ifdef CPPUDDLE_HAVE_COUNTERS
           instance()[location_id].number_recycling++;
+          sum_number_recycling++;
 #endif
           return std::get<0>(tuple);
         }
@@ -266,6 +377,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
                                      manage_content_lifetime)});
 #ifdef CPPUDDLE_HAVE_COUNTERS
         instance()[location_id].number_creation++;
+        sum_number_creation++;
 #endif
         if (manage_content_lifetime) {
           std::uninitialized_value_construct_n(buffer, number_of_elements);
@@ -290,7 +402,9 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
                                      manage_content_lifetime)});
 #ifdef CPPUDDLE_HAVE_COUNTERS
         instance()[location_id].number_creation++;
+        sum_number_creation++;
         instance()[location_id].number_bad_alloc++;
+        sum_number_bad_allocs++;
 #endif
         std::cerr << "Second attempt allocation successful!" << std::endl;
         if (manage_content_lifetime) {
@@ -316,7 +430,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         if (instance()[location_id].buffer_map.find(memory_location) !=
             instance()[location_id].buffer_map.end()) {
 #ifdef CPPUDDLE_HAVE_COUNTERS
-          instance()[location_id].number_dealloacation++;
+          instance()[location_id].number_deallocation++;
+          sum_number_deallocation++;
 #endif
           auto it = instance()[location_id].buffer_map.find(memory_location);
           assert(it != instance()[location_id].buffer_map.end());
@@ -332,6 +447,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         // managers
 #ifdef CPPUDDLE_HAVE_COUNTERS
         instance()[location_id].number_wrong_hints++;
+        sum_number_wrong_hints++;
 #endif
       }
 
@@ -345,7 +461,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         if (instance()[location_id].buffer_map.find(memory_location) !=
             instance()[location_id].buffer_map.end()) {
 #ifdef CPPUDDLE_HAVE_COUNTERS
-          instance()[location_id].number_dealloacation++;
+          instance()[location_id].number_deallocation++;
+          sum_number_deallocation++;
 #endif
           auto it = instance()[location_id].buffer_map.find(memory_location);
           assert(it != instance()[location_id].buffer_map.end());
@@ -382,8 +499,13 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     mutex_t mut;
 #ifdef CPPUDDLE_HAVE_COUNTERS
     /// Performance counters
-    size_t number_allocation{0}, number_dealloacation{0}, number_wrong_hints{0};
+    size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0};
     size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0};
+
+    static inline std::atomic<size_t> sum_number_allocation{0},
+        sum_number_deallocation{0}, sum_number_wrong_hints{0};
+    static inline std::atomic<size_t> sum_number_recycling{0},
+        sum_number_creation{0}, sum_number_bad_allocs{0};
 #endif
     /// default, private constructor - not automatically constructed due to the
     /// deleted constructors
@@ -412,35 +534,16 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
             clean_unused_buffers_only);
         buffer_recycler::add_finalize_callback(
             finalize);
+#ifdef CPPUDDLE_HAVE_COUNTERS
+        buffer_recycler::add_print_callback(
+            print_performance_counters);
+#endif
           });
     }
     static inline std::atomic<bool> is_finalized;
 
-
-    void clean_all_buffers(void) {
-#ifdef CPPUDDLE_HAVE_COUNTERS
-      if (number_allocation == 0 && number_recycling == 0 &&
-          number_bad_alloc == 0 && number_creation == 0 &&
-          unused_buffer_list.empty() && buffer_map.empty()) {
-        return;
-      }
-#endif
-      for (auto &buffer_tuple : unused_buffer_list) {
-        Host_Allocator alloc;
-        if (std::get<3>(buffer_tuple)) {
-          std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
-        }
-        alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
-      }
-      for (auto &map_tuple : buffer_map) {
-        auto buffer_tuple = map_tuple.second;
-        Host_Allocator alloc;
-        if (std::get<3>(buffer_tuple)) {
-          std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
-        }
-        alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
-      }
 #ifdef CPPUDDLE_HAVE_COUNTERS
+    void print_counters(void) {
       // Print performance counters
       size_t number_cleaned = unused_buffer_list.size() + buffer_map.size();
       std::cout << "\nBuffer manager destructor for (Alloc: "
@@ -475,7 +578,32 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
                 << static_cast<float>(number_recycling) / number_allocation *
                        100.0f
                 << "%" << std::endl;
+    }
+#endif
+
+    void clean_all_buffers(void) {
+#ifdef CPPUDDLE_HAVE_COUNTERS
+      if (number_allocation == 0 && number_recycling == 0 &&
+          number_bad_alloc == 0 && number_creation == 0 &&
+          unused_buffer_list.empty() && buffer_map.empty()) {
+        return;
+      }
 #endif
+      for (auto &buffer_tuple : unused_buffer_list) {
+        Host_Allocator alloc;
+        if (std::get<3>(buffer_tuple)) {
+          std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
+        }
+        alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
+      }
+      for (auto &map_tuple : buffer_map) {
+        auto buffer_tuple = map_tuple.second;
+        Host_Allocator alloc;
+        if (std::get<3>(buffer_tuple)) {
+          std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
+        }
+        alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
+      }
       unused_buffer_list.clear();
       buffer_map.clear();
 #ifdef CPPUDDLE_HAVE_COUNTERS
@@ -665,6 +793,7 @@ template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 using aggressive_recycle_std =
     detail::aggressive_recycle_allocator<T, std::allocator<T>>;
 
+inline void print_performance_counters() { detail::buffer_recycler::print_performance_counters(); }
 /// Deletes all buffers (even ones still marked as used), delete the buffer
 /// managers and the recycler itself
 inline void force_cleanup() { detail::buffer_recycler::clean_all(); }
diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp
index 882a2c0c..9178dfbf 100644
--- a/tests/allocator_aligned_test.cpp
+++ b/tests/allocator_aligned_test.cpp
@@ -92,6 +92,7 @@ int main(int argc, char *argv[]) {
     std::cout << "\n==> Aggressive recycle allocation test took "
               << aggressive_duration << "ms" << std::endl;
   }
+  recycler::print_performance_counters();
   recycler::force_cleanup(); // Cleanup all buffers and the managers for better
                              // comparison
 
@@ -113,6 +114,7 @@ int main(int argc, char *argv[]) {
     std::cout << "\n\n==> Recycle allocation test took " << recycle_duration
               << "ms" << std::endl;
   }
+  recycler::print_performance_counters();
   recycler::force_cleanup(); // Cleanup all buffers and the managers for better
                              // comparison
 
@@ -144,6 +146,7 @@ int main(int argc, char *argv[]) {
     std::cout << "Test information: Recycler was faster than default allocator!"
               << std::endl;
   }
+  recycler::print_performance_counters();
 #ifdef CPPUDDLE_HAVE_HPX  
   return hpx::finalize();
 #else
diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp
index 4d11cc16..6ddb70ad 100644
--- a/tests/allocator_hpx_test.cpp
+++ b/tests/allocator_hpx_test.cpp
@@ -126,6 +126,7 @@ int hpx_main(int argc, char *argv[]) {
       std::cout << "\n==> Recycle allocation test took " << recycle_duration
                 << "ms" << std::endl;
     }
+    recycler::print_performance_counters();
     recycler::force_cleanup(); // Cleanup all buffers and the managers for better
                                // comparison
 
@@ -166,6 +167,7 @@ int hpx_main(int argc, char *argv[]) {
       std::cout << "\n==> Aggressive recycle allocation test took "
                 << aggressive_duration << "ms" << std::endl;
     }
+    recycler::print_performance_counters();
     recycler::force_cleanup(); // Cleanup all buffers and the managers for better
                                // comparison
 
diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp
index f2256fdd..7708fe56 100644
--- a/tests/allocator_kokkos_executor_for_loop_test.cpp
+++ b/tests/allocator_kokkos_executor_for_loop_test.cpp
@@ -143,6 +143,7 @@ int main(int argc, char *argv[]) {
 
   // otherwise the HPX cuda polling futures won't work
   hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0));
+  recycler::print_performance_counters();
   // Cleanup all cuda views 
   // (otherwise the cuda driver might shut down before this gets done automatically at
   // the end of the programm)
diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp
index 6109f71d..055fa397 100644
--- a/tests/allocator_kokkos_test.cpp
+++ b/tests/allocator_kokkos_test.cpp
@@ -91,6 +91,7 @@ int main(int argc, char *argv[]) {
                         });
     Kokkos::fence();
   }
+  recycler::print_performance_counters();
 #ifdef CPPUDDLE_HAVE_HPX  
   return hpx::finalize();
 #else
diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp
index e86f00ce..0d8868db 100644
--- a/tests/allocator_test.cpp
+++ b/tests/allocator_test.cpp
@@ -88,6 +88,7 @@ int main(int argc, char *argv[]) {
     std::cout << "\n\n==> Aggressive recycle allocation test took "
               << aggressive_duration << "ms" << std::endl;
   }
+  recycler::print_performance_counters();
   recycler::force_cleanup(); // Cleanup all buffers and the managers for better
                              // comparison
 
@@ -107,6 +108,7 @@ int main(int argc, char *argv[]) {
     std::cout << "\n\n==> Recycle allocation test took " << recycle_duration
               << "ms" << std::endl;
   }
+  recycler::print_performance_counters();
   recycler::force_cleanup(); // Cleanup all buffers and the managers for better
                              // comparison
 
@@ -136,6 +138,7 @@ int main(int argc, char *argv[]) {
     std::cout << "Test information: Recycler was faster than default allocator!"
               << std::endl;
   }
+  recycler::print_performance_counters();
 #ifdef CPPUDDLE_HAVE_HPX  
   return hpx::finalize();
 #else
diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp
index e4050e3f..28a37806 100644
--- a/tests/work_aggregation_test.cpp
+++ b/tests/work_aggregation_test.cpp
@@ -863,6 +863,7 @@ int hpx_main(int argc, char *argv[]) {
   std::flush(hpx::cout);
   sleep(1);
 
+  recycler::print_performance_counters();
   recycler::force_cleanup(); // Cleanup all buffers and the managers
   return hpx::finalize();
 }

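Note: a rough usage sketch of the counters added in this patch (assuming a build with CPPUDDLE_WITH_COUNTERS=ON and HPX support; the allocator alias and buffer size are arbitrary example choices). Allocations through a recycling allocator feed both the per-manager statistics printed by recycler::print_performance_counters() and the aggregate counters registered with HPX.

#include <vector>
// #include "buffer_manager.hpp"   // assumed CPPuddle include
// #include <hpx/hpx_init.hpp>     // assumed HPX startup boilerplate

int hpx_main(int, char **) {
  {
    // Any allocation via a recycling allocator updates the new counters.
    std::vector<float, recycler::recycle_std<float>> data(1 << 20, 0.0f);
  }
  // Print the per-buffer-manager statistics collected so far
  // (allocations, recycled buffers, deallocations, wrong hints, ...).
  recycler::print_performance_counters();
  // With HPX, aggregate values are additionally exposed as counters such as
  //   /cppuddle/allocators/<allocator>_<type>/number_recycling/
  return hpx::finalize();
}
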
From ffdbb7bd50a1ea23be3b5201a7969395cbc4674d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 15 Aug 2023 14:36:50 -0500
Subject: [PATCH 23/42] Change buckets for multigpu support

---
 include/buffer_manager.hpp     | 193 +++++++++++++++++++++++++--------
 include/detail/config.hpp      |   2 +-
 include/kokkos_buffer_util.hpp |  22 +++-
 3 files changed, 165 insertions(+), 52 deletions(-)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index f17cc725..9a37a626 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -74,13 +74,16 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
   template <typename T, typename Host_Allocator>
   static T *get(size_t number_elements, bool manage_content_lifetime = false,
-      std::optional<size_t> location_hint = std::nullopt) {
+      std::optional<size_t> location_hint = std::nullopt, 
+      std::optional<size_t> device_id = std::nullopt) {
+
     return Host_Allocator{}.allocate(number_elements);
   }
   /// Marks an buffer as unused and fit for reusage
   template <typename T, typename Host_Allocator>
   static void mark_unused(T *p, size_t number_elements,
-      std::optional<size_t> location_hint = std::nullopt) {
+      std::optional<size_t> location_hint = std::nullopt,
+      std::optional<size_t> device_id = std::nullopt) {
     return Host_Allocator{}.deallocate(p, number_elements);
   }
 #else
@@ -88,15 +91,18 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
   /// buffer
   template <typename T, typename Host_Allocator>
   static T *get(size_t number_elements, bool manage_content_lifetime = false,
-      std::optional<size_t> location_hint = std::nullopt) {
-    return buffer_manager<T, Host_Allocator>::get(number_elements,
-                                                  manage_content_lifetime, location_hint);
+      std::optional<size_t> location_hint = std::nullopt, 
+      std::optional<size_t> device_id = std::nullopt) {
+    return buffer_manager<T, Host_Allocator>::get(
+        number_elements, manage_content_lifetime, location_hint, device_id);
   }
   /// Marks an buffer as unused and fit for reusage
   template <typename T, typename Host_Allocator>
   static void mark_unused(T *p, size_t number_elements,
-      std::optional<size_t> location_hint = std::nullopt) {
-    return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements);
+      std::optional<size_t> location_hint = std::nullopt, 
+      std::optional<size_t> device_id = std::nullopt) {
+    return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements,
+        location_hint, device_id);
   }
 #endif
   template <typename T, typename Host_Allocator>
@@ -104,11 +110,12 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 #ifdef CPPUDDLE_HAVE_COUNTERS
       buffer_manager<T, Host_Allocator>::register_counters_with_hpx();
 #else
-    std::cerr << "Warning: Trying to register allocator performance counters with HPX but CPPuddle was built "
+    std::cerr << "Warning: Trying to register allocator performance counters "
+                 "with HPX but CPPuddle was built "
                  "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
               << std::endl;
 #endif
-    }
+  }
 
   /// Deallocate all buffers, no matter whether they are marked as used or not
   static void clean_all() {
@@ -215,14 +222,14 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     /// Cleanup and delete this singleton
     static void clean() {
       assert(instance() && !is_finalized);
-      for (auto i = 0; i < number_instances; i++) {
+      for (auto i = 0; i < number_instances * max_number_gpus; i++) {
         std::lock_guard<mutex_t> guard(instance()[i].mut);
         instance()[i].clean_all_buffers();
       }
     }
     static void print_performance_counters() {
       assert(instance() && !is_finalized);
-      for (auto i = 0; i < number_instances; i++) {
+      for (auto i = 0; i < number_instances * max_number_gpus; i++) {
         std::lock_guard<mutex_t> guard(instance()[i].mut);
         instance()[i].print_counters();
       }
@@ -230,7 +237,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     static void finalize() {
       assert(instance() && !is_finalized);
       is_finalized = true;
-      for (auto i = 0; i < number_instances; i++) {
+      for (auto i = 0; i < number_instances * max_number_gpus; i++) {
         std::lock_guard<mutex_t> guard(instance()[i].mut);
         instance()[i].clean_all_buffers();
       }
@@ -239,7 +246,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     /// Cleanup all buffers not currently in use
     static void clean_unused_buffers_only() {
       assert(instance() && !is_finalized);
-      for (auto i = 0; i < number_instances; i++) {
+      for (auto i = 0; i < number_instances * max_number_gpus; i++) {
         std::lock_guard<mutex_t> guard(instance()[i].mut);
         for (auto &buffer_tuple : instance()[i].unused_buffer_list) {
           Host_Allocator alloc;
@@ -319,7 +326,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
     /// Tries to recycle or create a buffer of type T and size number_elements.
     static T *get(size_t number_of_elements, bool manage_content_lifetime,
-        std::optional<size_t> location_hint = std::nullopt) {
+        std::optional<size_t> location_hint = std::nullopt,
+        std::optional<size_t> gpu_device_id = std::nullopt) {
       init_callbacks_once();
       if (is_finalized) {
         throw std::runtime_error("Tried allocation after finalization");
@@ -328,11 +336,22 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
       size_t location_id = 0;
       if (location_hint) {
-        location_id = location_hint.value();
+        location_id = *location_hint;
       }
       if (location_id >= number_instances) {
         throw std::runtime_error("Tried to create buffer with invalid location_id [get]");
       }
+      size_t device_id = 0;
+      if (gpu_device_id) {
+        device_id = *gpu_device_id;
+      }
+      if (device_id >= max_number_gpus) {
+        throw std::runtime_error("Tried to create buffer with invalid device id [get]! "
+                                 "Is multigpu support enabled with the correct number "
+                                 "of GPUs?");
+      }
+
+      location_id = location_id + device_id * number_instances;
       std::lock_guard<mutex_t> guard(instance()[location_id].mut);
 
 
@@ -369,7 +388,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
       // No unused buffer found -> Create new one and return it
       try {
         recycler::device_selection::select_device_functor<T, Host_Allocator>{}(
-            location_id / instances_per_gpu);
+            device_id);
         Host_Allocator alloc;
         T *buffer = alloc.allocate(number_of_elements);
         instance()[location_id].buffer_map.insert(
@@ -395,7 +414,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         // We've done all we can in here
         Host_Allocator alloc;
         recycler::device_selection::select_device_functor<T, Host_Allocator>{}(
-            location_id / instances_per_gpu);
+            device_id);
         T *buffer = alloc.allocate(number_of_elements);
         instance()[location_id].buffer_map.insert(
             {buffer, std::make_tuple(buffer, number_of_elements, 1,
@@ -415,17 +434,32 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     }
 
     static void mark_unused(T *memory_location, size_t number_of_elements,
-        std::optional<size_t> location_hint = std::nullopt) {
+        std::optional<size_t> location_hint = std::nullopt,
+        std::optional<size_t> device_hint = std::nullopt) {
       if (is_finalized)
         return;
       assert(instance() && !is_finalized);
 
+      size_t location_id = 0;
       if (location_hint) {
-        size_t location_id = location_hint.value();
+        location_id = *location_hint;
         if (location_id >= number_instances) {
           throw std::runtime_error(
               "Buffer recylcer received invalid location hint [mark_unused]");
         }
+      }
+      size_t device_id = 0;
+      if (device_hint) {
+        device_id = *device_hint;
+        if (device_id >= max_number_gpus) {
+          throw std::runtime_error(
+              "Buffer recylcer received invalid devce hint [mark_unused]");
+        }
+      }
+
+      // Attempt 1 to find the correct bucket/location: Look at provided hint:
+      if (location_hint) {
+        size_t location_id = location_hint.value() + device_id * number_instances;
         std::lock_guard<mutex_t> guard(instance()[location_id].mut);
         if (instance()[location_id].buffer_map.find(memory_location) !=
             instance()[location_id].buffer_map.end()) {
@@ -443,19 +477,20 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
           instance()[location_id].buffer_map.erase(memory_location);
           return; // Success
         }
-        // hint was wrong - note that, and continue on with all other buffer
-        // managers
+        // hint was wrong 
 #ifdef CPPUDDLE_HAVE_COUNTERS
         instance()[location_id].number_wrong_hints++;
         sum_number_wrong_hints++;
 #endif
       }
-
-      for(size_t location_id = 0; location_id < number_instances; location_id++) {
+      // Failed to find the buffer in the specified location/device!
+      // Attempt 2 - Look for the buffer in other locations on the same device...
+      for (size_t location_id = device_id * number_instances;
+           location_id < (device_id + 1) * number_instances; location_id++) {
         if (location_hint) {
-           if (location_hint.value() == location_id) {
-             continue; // already tried this -> skip
-           }
+          if (*location_hint + device_id * number_instances == location_id) {
+            continue; // already tried this -> skip
+          }
         }
         std::lock_guard<mutex_t> guard(instance()[location_id].mut);
         if (instance()[location_id].buffer_map.find(memory_location) !=
@@ -475,6 +510,64 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
           return; // Success
         }
       }
+      // Failed to find the buffer on the specified device!
+      // Attempt 3 - Look for the buffer on other devices...
+      for (size_t local_device_id = 0; local_device_id < max_number_gpus;
+           local_device_id++) {
+        if (local_device_id == device_id)
+          continue; // already tried this device
+
+        // Try the hint location first again (though on a different device)
+        if (location_hint) {
+          size_t location_id = location_hint.value() + local_device_id * number_instances;
+          std::lock_guard<mutex_t> guard(instance()[location_id].mut);
+          if (instance()[location_id].buffer_map.find(memory_location) !=
+              instance()[location_id].buffer_map.end()) {
+#ifdef CPPUDDLE_HAVE_COUNTERS
+            instance()[location_id].number_deallocation++;
+            sum_number_deallocation++;
+#endif
+            auto it = instance()[location_id].buffer_map.find(memory_location);
+            assert(it != instance()[location_id].buffer_map.end());
+            auto &tuple = it->second;
+            // sanity checks:
+            assert(std::get<1>(tuple) == number_of_elements);
+            // move to the unused_buffer list
+            instance()[location_id].unused_buffer_list.push_front(tuple);
+            instance()[location_id].buffer_map.erase(memory_location);
+            return; // Success
+          }
+        }
+        // Failed - check all other locations on this device
+        for (size_t location_id = local_device_id * number_instances;
+             location_id < (local_device_id + 1) * number_instances; location_id++) {
+          if (location_hint) {
+            if (*location_hint + local_device_id * number_instances == location_id) {
+              continue; // already tried this -> skip
+            }
+          }
+          std::lock_guard<mutex_t> guard(instance()[location_id].mut);
+          if (instance()[location_id].buffer_map.find(memory_location) !=
+              instance()[location_id].buffer_map.end()) {
+#ifdef CPPUDDLE_HAVE_COUNTERS
+            instance()[location_id].number_deallocation++;
+            sum_number_deallocation++;
+#endif
+            auto it = instance()[location_id].buffer_map.find(memory_location);
+            assert(it != instance()[location_id].buffer_map.end());
+            auto &tuple = it->second;
+            // sanity checks:
+            assert(std::get<1>(tuple) == number_of_elements);
+            // move to the unused_buffer list
+            instance()[location_id].unused_buffer_list.push_front(tuple);
+            instance()[location_id].buffer_map.erase(memory_location);
+            return; // Success
+          }
+        }
+      }
+      // The buffer to be deleted is nowhere to be found - we looked everywhere!
+      // =>
+      // Failure! Handle it here...
 
       // TODO Throw exception instead in the futures, as soon as the recycler finalize is 
       // in all user codes
@@ -488,7 +581,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
           << "Warning! Tried to delete non-existing buffer within CPPuddle!"
           << std::endl;
       std::cerr << "Did you forget to call recycler::finalize?" << std::endl;
-    }
+      }
 
   private:
     /// List with all buffers still in usage
@@ -516,7 +609,7 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     operator=(buffer_manager<T, Host_Allocator> &&other) = delete;
     static std::unique_ptr<buffer_manager[]>& instance(void) {
       static std::unique_ptr<buffer_manager[]> instances{
-          new buffer_manager[number_instances]};
+          new buffer_manager[number_instances * max_number_gpus]};
       return instances;
     }
     static void init_callbacks_once(void) {
@@ -544,6 +637,8 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
 
 #ifdef CPPUDDLE_HAVE_COUNTERS
     void print_counters(void) {
+      if (number_allocation == 0)
+        return;
       // Print performance counters
       size_t number_cleaned = unused_buffer_list.size() + buffer_map.size();
       std::cout << "\nBuffer manager destructor for (Alloc: "
@@ -642,15 +737,16 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
   using underlying_allocator_type = Host_Allocator;
   static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
   const std::optional<size_t> dealloc_hint;
+  const std::optional<size_t> device_id;
 
 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
   recycle_allocator() noexcept
-      : dealloc_hint(std::nullopt) {}
+      : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
   explicit recycle_allocator(size_t hint) noexcept
-      : dealloc_hint(std::nullopt) {}
+      : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
   explicit recycle_allocator(
       recycle_allocator<T, Host_Allocator> const &other) noexcept
-      : dealloc_hint(std::nullopt) {}
+      : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
   T *allocate(std::size_t n) {
     T *data = buffer_recycler::get<T, Host_Allocator>(n);
     return data;
@@ -660,19 +756,20 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
   }
 #else
   recycle_allocator() noexcept
-      : dealloc_hint(hpx::get_worker_thread_num()) {}
-  explicit recycle_allocator(size_t hint) noexcept
-      : dealloc_hint(hint) {}
+      : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {}
+  explicit recycle_allocator(const size_t device_id) noexcept
+      : dealloc_hint(hint), device_id(device_id) {}
   explicit recycle_allocator(
       recycle_allocator<T, Host_Allocator> const &other) noexcept
-  : dealloc_hint(other.dealloc_hint) {}
+  : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
   T *allocate(std::size_t n) {
     T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, false, hpx::get_worker_thread_num());
+        n, false, hpx::get_worker_thread_num(), device_id);
     return data;
   }
   void deallocate(T *p, std::size_t n) {
-    buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint);
+    buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint,
+                                                    device_id);
   }
 #endif
 
@@ -707,16 +804,17 @@ struct aggressive_recycle_allocator {
   using value_type = T;
   using underlying_allocator_type = Host_Allocator;
   static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
-  std::optional<size_t> dealloc_hint;
+  const std::optional<size_t> dealloc_hint;
+  const std::optional<size_t> device_id;
 
 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
   aggressive_recycle_allocator() noexcept
-      : dealloc_hint(std::nullopt) {}
+      : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
   explicit aggressive_recycle_allocator(size_t hint) noexcept
-      : dealloc_hint(std::nullopt) {}
+      : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
   explicit aggressive_recycle_allocator(
       aggressive_recycle_allocator<T, Host_Allocator> const &) noexcept 
-  : dealloc_hint(std::nullopt) {}
+  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
   T *allocate(std::size_t n) {
     T *data = buffer_recycler::get<T, Host_Allocator>(
         n, true); // also initializes the buffer if it isn't reused
@@ -727,20 +825,21 @@ struct aggressive_recycle_allocator {
   }
 #else
   aggressive_recycle_allocator() noexcept
-      : dealloc_hint(hpx::get_worker_thread_num()) {}
-  explicit aggressive_recycle_allocator(size_t hint) noexcept
-      : dealloc_hint(hint) {}
+      : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {}
+  explicit aggressive_recycle_allocator(const size_t device_id) noexcept
+      : device_id(device_id) {}
   explicit aggressive_recycle_allocator(
       recycle_allocator<T, Host_Allocator> const &other) noexcept 
-    : dealloc_hint(other.dealloc_hint) {}
+    : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
   T *allocate(std::size_t n) {
     T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, true, hpx::get_worker_thread_num()); // also initializes the buffer
+        n, true, dealloc_hint, device_id); // also initializes the buffer
                                                 // if it isn't reused
     return data;
   }
   void deallocate(T *p, std::size_t n) {
-    buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint);
+    buffer_recycler::mark_unused<T, Host_Allocator>(p, n, dealloc_hint,
+                                                    device_id);
   }
 #endif
 
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index 0244ae9b..56ac7b8d 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -51,7 +51,7 @@ static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu");
 static_assert(number_instances >= max_number_gpus);
 static_assert(max_number_gpus > 0);
 static_assert(number_instances > 0);
-constexpr size_t instances_per_gpu = number_instances / max_number_gpus;
+//constexpr size_t instances_per_gpu = number_instances / max_number_gpus;
 
 /// Uses HPX thread information to determine which GPU should be used
 inline size_t get_device_id(void) {
diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp
index 52413cce..f6cf04fa 100644
--- a/include/kokkos_buffer_util.hpp
+++ b/include/kokkos_buffer_util.hpp
@@ -101,15 +101,29 @@ class recycled_view : public kokkos_type {
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
                                            allocator, total_elements)) {}
 
+  // TODO Add version with only a device parameter -- should use get but come with a different
+  // view deleter that just uses mark unused
+
+
+  // TODO NExt up: Add similar mechanism to aggregatation manager
+  
+
+  // TODO Add similar mechanism to cuda_device_buffer
+  
+
+  // TODO Switch Octo-Tiger hydro kokkos solver to this (should mostly just
+  // require 
+
+  // TODO These are meant to get the static data (predicatable location_id really required?)
   template <typename... Args,
             std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true>
-  recycled_view(std::size_t location_id, Args... args)
+  recycled_view(std::size_t device_id, std::size_t location_id, Args... args)
       : kokkos_type(
             detail::buffer_recycler::get<
                 element_type, typename alloc_type::underlying_allocator_type>(
                 kokkos_type::required_allocation_size(args...) /
                     sizeof(element_type),
-                false, location_id),
+                false, location_id, device_id),
             args...),
         total_elements(kokkos_type::required_allocation_size(args...) /
                        sizeof(element_type)),
@@ -119,13 +133,13 @@ class recycled_view : public kokkos_type {
   template <
       typename layout_t,
       std::enable_if_t<Kokkos::is_array_layout<layout_t>::value, bool> = true>
-  recycled_view(std::size_t location_id, layout_t layout)
+  recycled_view(std::size_t device_id, std::size_t location_id, layout_t layout)
       : kokkos_type(
             detail::buffer_recycler::get<
                 element_type, typename alloc_type::underlying_allocator_type>(
                 kokkos_type::required_allocation_size(layout) /
                     sizeof(element_type),
-                false, location_id),
+                false, location_id, device_id),
             layout),
         total_elements(kokkos_type::required_allocation_size(layout) /
                        sizeof(element_type)),

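Note: to make the new layout easier to follow, a minimal standalone sketch (example sizes only) of the bucket indexing introduced here. Each (location, device) pair gets its own bucket, laid out device-major -- the same formula used in get() and mark_unused() above.

#include <cstddef>

// Example configuration values; in CPPuddle these come from
// CPPUDDLE_MAX_NUMBER_WORKERS and CPPUDDLE_MAX_NUMBER_GPUS.
constexpr std::size_t number_instances = 128;
constexpr std::size_t max_number_gpus = 4;

constexpr std::size_t bucket_index(std::size_t location_id, std::size_t device_id) {
  // Buckets belonging to one device are contiguous:
  // [device_id * number_instances, (device_id + 1) * number_instances).
  return location_id + device_id * number_instances;
}

static_assert(bucket_index(0, 0) == 0);
static_assert(bucket_index(5, 1) == number_instances + 5);
static_assert(bucket_index(number_instances - 1, max_number_gpus - 1) ==
              number_instances * max_number_gpus - 1);
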
From 7880d4b9c71eebeb60ea7840479daf49c90cac87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Fri, 18 Aug 2023 12:52:13 -0500
Subject: [PATCH 24/42] Remove location_id from user interface

---
 include/aggregation_manager.hpp | 17 ++++++++-----
 include/buffer_manager.hpp      | 37 ++++++++++++++++++++-------
 include/cuda_buffer_util.hpp    | 41 +++++++++---------------------
 include/detail/config.hpp       |  3 ++-
 include/kokkos_buffer_util.hpp  | 45 ++++++++++-----------------------
 5 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index e687e106..8961226e 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -532,9 +532,9 @@ template <typename Executor> class Aggregated_Executor {
   aggregation_mutex_t mut;
 
   /// Data entry for a buffer allocation: void* pointer, size_t for
-  /// buffer-size, atomic for the slice counter
+  /// buffer-size, atomic for the slice counter, location_id, gpu_id
   using buffer_entry_t =
-      std::tuple<void*, const size_t, std::atomic<size_t>, bool, const size_t>;
+      std::tuple<void*, const size_t, std::atomic<size_t>, bool, const size_t, size_t>;
   /// Keeps track of the aggregated buffer allocations done in all the slices
   std::deque<buffer_entry_t> buffer_allocations;
   /// Map pointer to deque index for fast access in the deallocations
@@ -560,14 +560,16 @@ template <typename Executor> class Aggregated_Executor {
 
         // Default location -- useful for GPU builds as we otherwise create way too
         // many different buffers for different aggregation sizes on different GPUs
-        size_t location_id = gpu_id * instances_per_gpu;
+        /* size_t location_id = gpu_id * instances_per_gpu; */
+        // Use integer division to only use 0, 16, 32, ... as bucket ids
+        size_t location_id = (hpx::get_worker_thread_num() / 16) * 16;
 #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
         if (max_slices == 1) {
           // get prefered location: aka the current hpx threads location
           // Usually handy for CPU builds where we want to use the buffers
           // close to the current CPU core
           /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */
-          location_id = (gpu_id) * instances_per_gpu;
+          /* location_id = (gpu_id) * instances_per_gpu; */
           // division makes sure that we always use the same instance to store our gpu buffers.
         }
 #endif
@@ -576,10 +578,10 @@ template <typename Executor> class Aggregated_Executor {
         // buffer_recycler...
         T *aggregated_buffer =
             recycler::detail::buffer_recycler::get<T, Host_Allocator>(
-                size, manage_content_lifetime, location_id);
+                size, manage_content_lifetime, location_id, gpu_id);
         // Create buffer entry for this buffer
         buffer_allocations.emplace_back(static_cast<void *>(aggregated_buffer),
-                                        size, 1, true, location_id);
+                                        size, 1, true, location_id, gpu_id);
 
 #ifndef NDEBUG
         // if previousely used the buffer should not be in usage anymore
@@ -633,6 +635,7 @@ template <typename Executor> class Aggregated_Executor {
     auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]);
     auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]);
     const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]);
+    const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]);
     assert(valid);
     T *buffer_pointer = static_cast<T *>(buffer_pointer_void);
 
@@ -650,7 +653,7 @@ template <typename Executor> class Aggregated_Executor {
       if (valid) {
         assert(buffers_in_use == true);
         recycler::detail::buffer_recycler::mark_unused<T, Host_Allocator>(
-            buffer_pointer, buffer_size, location_id);
+            buffer_pointer, buffer_size, location_id, gpu_id);
         // mark buffer as invalid to prevent any other slice from marking the
         // buffer as unused
         valid = false;
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 9a37a626..ab4b5144 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -284,6 +284,11 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
         sum_number_wrong_hints = 0;
       return sum_number_wrong_hints;
     }
+    static size_t get_sum_number_wrong_device_hints(bool reset) {
+      if (reset)
+        sum_number_wrong_device_hints = 0;
+      return sum_number_wrong_device_hints;
+    }
     static size_t get_sum_number_bad_allocs(bool reset) {
       if (reset)
         sum_number_bad_allocs = 0;
@@ -317,6 +322,10 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
           std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"),
           &get_sum_number_wrong_hints,
           "Number of wrong hints supplied to the dealloc method with this allocator");
+      hpx::performance_counters::install_counter_type(
+          std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"),
+          &get_sum_number_wrong_device_hints,
+          "Number of wrong device hints supplied to the dealloc method with this allocator");
       hpx::performance_counters::install_counter_type(
           std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"),
           &get_sum_number_bad_allocs,
@@ -510,6 +519,12 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
           return; // Success
         }
       }
+      // device hint was wrong 
+#ifdef CPPUDDLE_HAVE_COUNTERS
+      if (device_hint) {
+        sum_number_wrong_device_hints++;
+      }
+#endif
       // Failed to find buffer on the specified device!
       // Attempt 3 - Look for buffer on other devices...
       for (size_t local_device_id = 0; local_device_id < max_number_gpus;
@@ -592,17 +607,17 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
     mutex_t mut;
 #ifdef CPPUDDLE_HAVE_COUNTERS
     /// Performance counters
-    size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0};
-    size_t number_recycling{0}, number_creation{0}, number_bad_alloc{0};
+    size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0},
+        number_recycling{0}, number_creation{0}, number_bad_alloc{0};
 
     static inline std::atomic<size_t> sum_number_allocation{0},
-        sum_number_deallocation{0}, sum_number_wrong_hints{0};
-    static inline std::atomic<size_t> sum_number_recycling{0},
+        sum_number_deallocation{0}, sum_number_wrong_hints{0},
+        sum_number_wrong_device_hints{0}, sum_number_recycling{0},
         sum_number_creation{0}, sum_number_bad_allocs{0};
 #endif
-    /// default, private constructor - not automatically constructed due to the
-    /// deleted constructors
-    buffer_manager() = default;
+    /// default, private constructor - not automatically constructed due to
+    /// the deleted constructors
+    buffer_manager() = default;
     buffer_manager&
     operator=(buffer_manager<T, Host_Allocator> const &other) = default;
     buffer_manager&
@@ -758,7 +773,9 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
   recycle_allocator() noexcept
       : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {}
   explicit recycle_allocator(const size_t device_id) noexcept
-      : dealloc_hint(hint), device_id(device_id) {}
+      : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {}
+  explicit recycle_allocator(const size_t device_id, const size_t location_id) noexcept
+      : dealloc_hint(location_id), device_id(device_id) {}
   explicit recycle_allocator(
       recycle_allocator<T, Host_Allocator> const &other) noexcept
   : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
@@ -827,7 +844,9 @@ struct aggressive_recycle_allocator {
   aggressive_recycle_allocator() noexcept
       : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {}
   explicit aggressive_recycle_allocator(const size_t device_id) noexcept
-      : device_id(device_id) {}
+      : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {}
+  explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept
+      : dealloc_hint(location_id), device_id(device_id) {}
   explicit aggressive_recycle_allocator(
       recycle_allocator<T, Host_Allocator> const &other) noexcept 
     : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index 8fb57d4f..e454d8c0 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -111,24 +111,18 @@ using recycle_allocator_cuda_device =
 
 template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 struct cuda_device_buffer {
-  size_t gpu_id{0};
+  recycle_allocator_cuda_device<T> allocator;
   T *device_side_buffer;
   size_t number_of_elements;
-  explicit cuda_device_buffer(size_t number_of_elements)
-      : number_of_elements(number_of_elements) {
-    device_side_buffer =
-        recycle_allocator_cuda_device<T>{}.allocate(number_of_elements);
-  }
-  // TODO deprecate and remove gpu_id
-  explicit cuda_device_buffer(size_t number_of_elements, size_t gpu_id)
-      : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) {
-    assert(gpu_id == 0);
+
+  cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0)
+      : allocator{device_id}, number_of_elements(number_of_elements) {
+    assert(device_id < max_number_gpus);
     device_side_buffer =
-        recycle_allocator_cuda_device<T>{}.allocate(number_of_elements);
+        allocator.allocate(number_of_elements);
   }
   ~cuda_device_buffer() {
-    recycle_allocator_cuda_device<T>{}.deallocate(device_side_buffer,
-                                                  number_of_elements);
+    allocator.deallocate(device_side_buffer, number_of_elements);
   }
   // not yet implemented
   cuda_device_buffer(cuda_device_buffer const &other) = delete;
@@ -136,30 +130,19 @@ struct cuda_device_buffer {
   cuda_device_buffer(cuda_device_buffer const &&other) = delete;
   cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete;
 
-private:
-  bool set_id{false};
 };
 
 template <typename T, typename Host_Allocator, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 struct cuda_aggregated_device_buffer {
-  size_t gpu_id{0};
   T *device_side_buffer;
   size_t number_of_elements;
-  explicit cuda_aggregated_device_buffer(size_t number_of_elements)
-      : number_of_elements(number_of_elements) {
-    device_side_buffer =
-        recycle_allocator_cuda_device<T>{}.allocate(number_of_elements);
-  }
-  // TODO deprecate and remove gpu_id
-  explicit cuda_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc)
-      : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) {
-    assert(gpu_id == 0);
+  explicit cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc)
+      : number_of_elements(number_of_elements), alloc(alloc) {
     device_side_buffer =
         alloc.allocate(number_of_elements);
   }
   ~cuda_aggregated_device_buffer() {
-    alloc.deallocate(device_side_buffer,
-                                                  number_of_elements);
+    alloc.deallocate(device_side_buffer, number_of_elements);
   }
   // not yet implemented
   cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete;
@@ -168,8 +151,8 @@ struct cuda_aggregated_device_buffer {
   cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete;
 
 private:
-  bool set_id{false};
-  Host_Allocator &alloc;
+  Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence
+                         // for the entire lifetime of this buffer
 };
 
 namespace device_selection {
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index 56ac7b8d..ea6e9a26 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -56,7 +56,8 @@ static_assert(number_instances > 0);
 /// Uses HPX thread information to determine which GPU should be used
 inline size_t get_device_id(void) {
 #if defined(CPPUDDLE_HAVE_HPX) 
-    return hpx::get_worker_thread_num() / instances_per_gpu; 
+    //return hpx::get_worker_thread_num() / max_num_gpus; 
+    return 0; 
 #else
     return 0;
 #endif
diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp
index f6cf04fa..2945b422 100644
--- a/include/kokkos_buffer_util.hpp
+++ b/include/kokkos_buffer_util.hpp
@@ -16,7 +16,7 @@ template<typename element_type, typename alloc_type>
 struct view_deleter {
   alloc_type allocator;
   size_t total_elements;
-  view_deleter(alloc_type &alloc, size_t total_elements) : allocator(alloc),
+  view_deleter(alloc_type alloc, size_t total_elements) : allocator(alloc),
     total_elements(total_elements) {}
   void operator()(element_type* p) {
     allocator.deallocate(p, total_elements);
@@ -29,6 +29,7 @@ class aggregated_recycled_view : public kokkos_type {
   alloc_type allocator;
   size_t total_elements{0};
   std::shared_ptr<element_type> data_ref_counter;
+  static_assert(std::is_same_v<element_type, typename alloc_type::value_type>);
 
 public:
   using view_type = kokkos_type;
@@ -79,10 +80,10 @@ class aggregated_recycled_view : public kokkos_type {
   ~aggregated_recycled_view() {}
 };
 
+
 template <typename kokkos_type, typename alloc_type, typename element_type>
 class recycled_view : public kokkos_type {
 private:
-  alloc_type allocator;
   size_t total_elements{0};
   std::shared_ptr<element_type> data_ref_counter;
 
@@ -93,58 +94,38 @@ class recycled_view : public kokkos_type {
             std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true>
   recycled_view(Args... args)
       : kokkos_type(
-            allocator.allocate(kokkos_type::required_allocation_size(args...) /
+            alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) /
                                sizeof(element_type)),
             args...),
         total_elements(kokkos_type::required_allocation_size(args...) /
                        sizeof(element_type)),
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
-                                           allocator, total_elements)) {}
-
-  // TODO Add version with only a device parameter -- should use get but come with a different
-  // view deleter that just uses mark unused
-
+                                           alloc_type{}, total_elements)) {}
 
-  // TODO NExt up: Add similar mechanism to aggregatation manager
-  
-
-  // TODO Add similar mechanism to cuda_device_buffer
-  
-
-  // TODO Switch Octo-Tiger hydro kokkos solver to this (should mostly just
-  // require 
-
-  // TODO These are meant to get the static data (predicatable location_id really required?)
   template <typename... Args,
             std::enable_if_t<sizeof...(Args) == kokkos_type::rank, bool> = true>
-  recycled_view(std::size_t device_id, std::size_t location_id, Args... args)
+  recycled_view(const size_t device_id, Args... args)
       : kokkos_type(
-            detail::buffer_recycler::get<
-                element_type, typename alloc_type::underlying_allocator_type>(
-                kokkos_type::required_allocation_size(args...) /
-                    sizeof(element_type),
-                false, location_id, device_id),
+            alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) /
+                               sizeof(element_type)),
             args...),
         total_elements(kokkos_type::required_allocation_size(args...) /
                        sizeof(element_type)),
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
-                                           allocator, total_elements)) {}
+                                           alloc_type{device_id}, total_elements)) {}
 
   template <
       typename layout_t,
       std::enable_if_t<Kokkos::is_array_layout<layout_t>::value, bool> = true>
-  recycled_view(std::size_t device_id, std::size_t location_id, layout_t layout)
+  recycled_view(std::size_t device_id, layout_t layout)
       : kokkos_type(
-            detail::buffer_recycler::get<
-                element_type, typename alloc_type::underlying_allocator_type>(
-                kokkos_type::required_allocation_size(layout) /
-                    sizeof(element_type),
-                false, location_id, device_id),
+            alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) /
+                               sizeof(element_type)),
             layout),
         total_elements(kokkos_type::required_allocation_size(layout) /
                        sizeof(element_type)),
         data_ref_counter(this->data(), view_deleter<element_type, alloc_type>(
-                                           allocator, total_elements)) {}
+                                           alloc_type{device_id}, total_elements)) {}
 
   recycled_view(
       const recycled_view<kokkos_type, alloc_type, element_type> &other)

From f3d71c83d3e0cb82d97f3ddcdce1e07c5d3dea1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Fri, 18 Aug 2023 13:36:59 -0500
Subject: [PATCH 25/42] WIP, add gpu_id to stream managers

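This extends aggregation_pool::init with an optional device count and makes
get_device_id take the number of usable GPUs. A rough usage sketch (the pool
alias, executor type and mode value below are placeholders, not part of this
patch):

    // assuming a pool type along the lines of
    //   using agg_pool = aggregation_pool<kernelname, executor_t, round_robin_pool<executor_t>>;
    agg_pool::init(8, 16, Aggregated_Executor_Modes::EAGER, std::nullopt); // keep compile-time GPU count
    auto slice_fut = agg_pool::request_executor_slice(); // GPU chosen via get_device_id(number_devices)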
---
 include/aggregation_manager.hpp | 24 +++++++++++++++++++++---
 include/detail/config.hpp       |  6 +++---
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 8961226e..4d91b014 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -6,6 +6,7 @@
 #ifndef WORK_AGGREGATION_MANAGER
 #define WORK_AGGREGATION_MANAGER
 
+#include <stdexcept>
 #define DEBUG_AGGREGATION_CALLS 1
 
 #include <stdio.h>
@@ -1014,8 +1015,23 @@ class aggregation_pool {
   /// interface
   template <typename... Ts>
   static void init(size_t number_of_executors, size_t slices_per_executor,
-                   Aggregated_Executor_Modes mode) {
-    for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) {
+                   Aggregated_Executor_Modes mode, std::optional<size_t> overwrite_number_devices) {
+    if (is_initialized) {
+      throw std::runtime_error(
+          std::string("Trying to initialize cppuddle aggregation pool twice") +
+          " Kernel: " + std::string(kernelname));
+    }
+    if (overwrite_number_devices) {
+      if (*overwrite_number_devices > max_number_gpus) {
+        throw std::runtime_error(
+            std::string(
+                "Trying to initialize aggregation with more devices than the "
+                "maximum number of GPUs given at compiletime") +
+            " Kernel: " + std::string(kernelname));
+      }
+      number_devices = *overwrite_number_devices;
+    }
+    for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) {
       std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
       assert(instance()[gpu_id].aggregation_executor_pool.empty());
       for (int i = 0; i < number_of_executors; i++) {
@@ -1029,7 +1045,7 @@ class aggregation_pool {
 
   /// Will always return a valid executor slice
   static decltype(auto) request_executor_slice(void) {
-    const size_t gpu_id = get_device_id();
+    const size_t gpu_id = get_device_id(number_devices);
     /* const size_t gpu_id = 1; */
     std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
     assert(!instance()[gpu_id].aggregation_executor_pool.empty());
@@ -1090,6 +1106,8 @@ class aggregation_pool {
         new aggregation_pool[max_number_gpus]};
     return pool_instances;
   }
+  static inline size_t number_devices = max_number_gpus;
+  static inline bool is_initialized = false;
   aggregation_pool() = default;
 
 public:
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index ea6e9a26..7dd8837f 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -54,10 +54,10 @@ static_assert(number_instances > 0);
 //constexpr size_t instances_per_gpu = number_instances / max_number_gpus;
 
 /// Uses HPX thread information to determine which GPU should be used
-inline size_t get_device_id(void) {
+inline size_t get_device_id(const size_t number_gpus) {
 #if defined(CPPUDDLE_HAVE_HPX) 
-    //return hpx::get_worker_thread_num() / max_num_gpus; 
-    return 0; 
+    assert(number_gpus < max_number_gpus);
+    return hpx::get_worker_thread_num() / number_gpus; 
 #else
     return 0;
 #endif

From 1026cd5c42fef625e668a8f7c74ddda08cee0da7 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Sat, 19 Aug 2023 14:05:10 -0500
Subject: [PATCH 26/42] Fix multigpu

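init now takes a plain device count (defaulting to 1), marks the pool as
initialized, and request_executor_slice refuses to run on an uninitialized
pool. get_device_id maps HPX worker threads to devices via modulo. Sketch
(pool alias and mode value are placeholders):

    agg_pool::init(/*number_of_executors=*/8, /*slices_per_executor=*/16,
                   Aggregated_Executor_Modes::EAGER, /*num_devices=*/2);
    auto slice_fut = agg_pool::request_executor_slice(); // throws if init was never called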
---
 include/aggregation_manager.hpp | 31 +++++++++++---------
 include/buffer_manager.hpp      | 10 +++----
 include/detail/config.hpp       |  5 ++--
 include/stream_manager.hpp      | 51 +++++++++++++++++----------------
 4 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 4d91b014..35338bfc 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -563,7 +563,7 @@ template <typename Executor> class Aggregated_Executor {
         // many different buffers for different aggregation sizes on different GPUs
         /* size_t location_id = gpu_id * instances_per_gpu; */
         // Use integer conversion to only use 0 16 32 ... as buckets
-        size_t location_id = (hpx::get_worker_thread_num() / 16) * 16; 
+        size_t location_id = ((hpx::get_worker_thread_num() % number_instances) / 16) * 16; 
 #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
         if (max_slices == 1) {
           // get prefered location: aka the current hpx threads location
@@ -1015,23 +1015,22 @@ class aggregation_pool {
   /// interface
   template <typename... Ts>
   static void init(size_t number_of_executors, size_t slices_per_executor,
-                   Aggregated_Executor_Modes mode, std::optional<size_t> overwrite_number_devices) {
+                   Aggregated_Executor_Modes mode, size_t num_devices = 1) {
     if (is_initialized) {
       throw std::runtime_error(
           std::string("Trying to initialize cppuddle aggregation pool twice") +
-          " Kernel: " + std::string(kernelname));
+          " Agg pool name: " + std::string(kernelname));
     }
-    if (overwrite_number_devices) {
-      if (*overwrite_number_devices > max_number_gpus) {
-        throw std::runtime_error(
-            std::string(
-                "Trying to initialize aggregation with more devices than the "
-                "maximum number of GPUs given at compiletime") +
-            " Kernel: " + std::string(kernelname));
-      }
-      number_devices = *overwrite_number_devices;
+    if (num_devices > max_number_gpus) {
+      throw std::runtime_error(
+          std::string(
+              "Trying to initialize aggregation with more devices than the "
+              "maximum number of GPUs given at compiletime") +
+          " Agg pool name: " + std::string(kernelname));
     }
+    number_devices = num_devices;
     for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) {
+
       std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
       assert(instance()[gpu_id].aggregation_executor_pool.empty());
       for (int i = 0; i < number_of_executors; i++) {
@@ -1041,10 +1040,16 @@ class aggregation_pool {
       instance()[gpu_id].slices_per_executor = slices_per_executor;
       instance()[gpu_id].mode = mode;
     }
+    is_initialized = true;
   }
 
   /// Will always return a valid executor slice
   static decltype(auto) request_executor_slice(void) {
+    if (!is_initialized) {
+      throw std::runtime_error(
+          std::string("Trying to use cppuddle aggregation pool without first calling init") +
+          " Agg poolname: " + std::string(kernelname));
+    }
     const size_t gpu_id = get_device_id(number_devices);
     /* const size_t gpu_id = 1; */
     std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
@@ -1106,7 +1111,7 @@ class aggregation_pool {
         new aggregation_pool[max_number_gpus]};
     return pool_instances;
   }
-  static inline size_t number_devices = max_number_gpus;
+  static inline size_t number_devices = 1;
   static inline bool is_initialized = false;
   aggregation_pool() = default;
 
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index ab4b5144..ea04160b 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -771,9 +771,9 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
   }
 #else
   recycle_allocator() noexcept
-      : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {}
+      : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {}
   explicit recycle_allocator(const size_t device_id) noexcept
-      : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {}
+      : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {}
   explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept
       : dealloc_hint(location_id), device_id(device_id) {}
   explicit recycle_allocator(
@@ -781,7 +781,7 @@ template <typename T, typename Host_Allocator> struct recycle_allocator {
   : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
   T *allocate(std::size_t n) {
     T *data = buffer_recycler::get<T, Host_Allocator>(
-        n, false, hpx::get_worker_thread_num(), device_id);
+        n, false, hpx::get_worker_thread_num() % number_instances, device_id);
     return data;
   }
   void deallocate(T *p, std::size_t n) {
@@ -842,9 +842,9 @@ struct aggressive_recycle_allocator {
   }
 #else
   aggressive_recycle_allocator() noexcept
-      : dealloc_hint(hpx::get_worker_thread_num()), device_id(0) {}
+      : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {}
   explicit aggressive_recycle_allocator(const size_t device_id) noexcept
-      : dealloc_hint(hpx::get_worker_thread_num()), device_id(device_id) {}
+      : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {}
   explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept
       : dealloc_hint(location_id), device_id(device_id) {}
   explicit aggressive_recycle_allocator(
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index 7dd8837f..9b425f8b 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -48,16 +48,15 @@ constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS;
 #ifndef CPPUDDLE_HAVE_HPX
 static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu");
 #endif
-static_assert(number_instances >= max_number_gpus);
+//static_assert(number_instances >= max_number_gpus);
 static_assert(max_number_gpus > 0);
-static_assert(number_instances > 0);
 //constexpr size_t instances_per_gpu = number_instances / max_number_gpus;
 
 /// Uses HPX thread information to determine which GPU should be used
 inline size_t get_device_id(const size_t number_gpus) {
 #if defined(CPPUDDLE_HAVE_HPX) 
     assert(number_gpus < max_number_gpus);
-    return hpx::get_worker_thread_num() / number_gpus; 
+    return hpx::get_worker_thread_num() % number_gpus; 
 #else
     return 0;
 #endif
diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index a10229a2..8be13b32 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -60,9 +60,10 @@ template <class Interface> class round_robin_pool {
     return *(
         std::min_element(std::begin(ref_counters), std::end(ref_counters)));
   }
-  size_t get_next_device_id() {
-    return 0; // single gpu pool
-  }
+  // TODO Remove
+  /* size_t get_next_device_id() { */
+  /*   return 0; // single gpu pool */
+  /* } */
 };
 
 template <class Interface> class priority_pool {
@@ -103,9 +104,10 @@ template <class Interface> class priority_pool {
     return ref_counters[priorities[0]] < load_limit;
   }
   size_t get_current_load() { return ref_counters[priorities[0]]; }
-  size_t get_next_device_id() {
-    return 0; // single gpu pool
-  }
+  // TODO remove
+  /* size_t get_next_device_id() { */
+  /*   return 0; // single gpu pool */
+  /* } */
 };
 
 /// Access/Concurrency Control for stream pool implementation
@@ -130,28 +132,28 @@ class stream_pool {
     stream_pool_implementation<Interface, Pool>::cleanup();
   }
   template <class Interface, class Pool>
-  static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = get_device_id()) {
+  static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id) {
     return stream_pool_implementation<Interface, Pool>::get_interface(gpu_id);
   }
   template <class Interface, class Pool>
-  static void release_interface(size_t index, const size_t gpu_id = get_device_id()) noexcept {
+  static void release_interface(size_t index, const size_t gpu_id) noexcept {
     stream_pool_implementation<Interface, Pool>::release_interface(index,
         gpu_id);
   }
   template <class Interface, class Pool>
-  static bool interface_available(size_t load_limit) noexcept {
+  static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept {
     return stream_pool_implementation<Interface, Pool>::interface_available(
-        load_limit, get_device_id());
+        load_limit, gpu_id);
   }
   template <class Interface, class Pool>
-  static size_t get_current_load() noexcept {
+  static size_t get_current_load(const size_t gpu_id = 0) noexcept {
     return stream_pool_implementation<Interface, Pool>::get_current_load(
-        get_device_id());
+        gpu_id);
   }
-  // TODO deprecated! Remove...
   template <class Interface, class Pool>
-  static size_t get_next_device_id() noexcept {
-    return stream_pool_implementation<Interface, Pool>::get_next_device_id(get_device_id());
+  static size_t get_next_device_id(const size_t number_gpus) noexcept {
+    // TODO add round robin and min strategy
+    return get_device_id(number_gpus);
   }
 
   template <class Interface, class Pool>
@@ -232,11 +234,11 @@ class stream_pool {
       return instance().streampools[gpu_id].get_current_load();
     }
     // TODO deprecated! Remove...
-    static size_t get_next_device_id(const size_t gpu_id = 0) {
-      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == max_number_gpus);
-      return instance().streampools[gpu_id].get_next_device_id();
-    }
+    /* static size_t get_next_device_id(const size_t gpu_id = 0) { */
+    /*   std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); */
+    /*   assert(instance().streampools.size() == max_number_gpus); */
+    /*   return instance().streampools[gpu_id].get_next_device_id(); */
+    /* } */
 
     static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
@@ -288,16 +290,16 @@ class stream_pool {
 
 template <class Interface, class Pool> class stream_interface {
 public:
-  stream_interface()
-      : t(stream_pool::get_interface<Interface, Pool>()),
-        interface(std::get<0>(t)), interface_index(std::get<1>(t)) {}
+  explicit stream_interface(size_t gpu_id)
+      : t(stream_pool::get_interface<Interface, Pool>(gpu_id)),
+        interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {}
 
   stream_interface(const stream_interface &other) = delete;
   stream_interface &operator=(const stream_interface &other) = delete;
   stream_interface(stream_interface &&other) = delete;
   stream_interface &operator=(stream_interface &&other) = delete;
   ~stream_interface() {
-    stream_pool::release_interface<Interface, Pool>(interface_index);
+    stream_pool::release_interface<Interface, Pool>(interface_index, gpu_id);
   }
 
   template <typename F, typename... Ts>
@@ -318,6 +320,7 @@ template <class Interface, class Pool> class stream_interface {
 private:
   std::tuple<Interface &, size_t> t;
   size_t interface_index;
+  size_t gpu_id;
 
 public:
   Interface &interface;

From 95c77224f1ec08778fecd6d4456b1ae0cf96083c Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 21 Aug 2023 01:05:35 -0500
Subject: [PATCH 27/42] Adapt hip_device_buffers for multigpu

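hip_device_buffer now requires an explicit device_id and owns a per-device
recycle allocator; hip_aggregated_device_buffer takes the aggregation
allocator by reference instead of a gpu_id. Sketch (size and device id are
illustrative):

    recycler::hip_device_buffer<double> device_buf(512, /*device_id=*/0);
    // device_buf.device_side_buffer is a 512-element buffer on GPU 0,
    // returned to the recycler when device_buf is destroyed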
---
 include/cuda_buffer_util.hpp |  2 +-
 include/hip_buffer_util.hpp  | 39 ++++++++++--------------------------
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp
index e454d8c0..55d3397a 100644
--- a/include/cuda_buffer_util.hpp
+++ b/include/cuda_buffer_util.hpp
@@ -136,7 +136,7 @@ template <typename T, typename Host_Allocator, std::enable_if_t<std::is_trivial<
 struct cuda_aggregated_device_buffer {
   T *device_side_buffer;
   size_t number_of_elements;
-  explicit cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc)
+  cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc)
       : number_of_elements(number_of_elements), alloc(alloc) {
     device_side_buffer =
         alloc.allocate(number_of_elements);
diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp
index c33566e5..7942792b 100644
--- a/include/hip_buffer_util.hpp
+++ b/include/hip_buffer_util.hpp
@@ -22,7 +22,6 @@ template <class T> struct hip_pinned_allocator {
   template <class U>
   explicit hip_pinned_allocator(hip_pinned_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
-    hipSetDevice(get_device_id());
     T *data;
     // hipError_t error =
     //     hipMallocHost(reinterpret_cast<void **>(&data), n * sizeof(T));
@@ -69,7 +68,6 @@ template <class T> struct hip_device_allocator {
   template <class U>
   explicit hip_device_allocator(hip_device_allocator<U> const &) noexcept {}
   T *allocate(std::size_t n) {
-    hipSetDevice(get_device_id());
     T *data;
     hipError_t error = hipMalloc(&data, n * sizeof(T));
     if (error != hipSuccess) {
@@ -115,22 +113,18 @@ using recycle_allocator_hip_device =
 // TODO Is this even required? (cuda version should work fine...)
 template <typename T, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 struct hip_device_buffer {
-  size_t gpu_id{0};
+  recycle_allocator_hip_device<T> allocator;
   T *device_side_buffer;
   size_t number_of_elements;
-  explicit hip_device_buffer(size_t number_of_elements)
-      : number_of_elements(number_of_elements) {
-    device_side_buffer =
-        recycle_allocator_hip_device<T>{}.allocate(number_of_elements);
-  }
-  explicit hip_device_buffer(size_t number_of_elements, size_t gpu_id)
-      : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true) {
+
+  hip_device_buffer(size_t number_of_elements, size_t device_id)
+      : allocator{device_id}, number_of_elements(number_of_elements) {
+    assert(device_id < max_number_gpus);
     device_side_buffer =
         recycle_allocator_hip_device<T>{}.allocate(number_of_elements);
   }
   ~hip_device_buffer() {
-    recycle_allocator_hip_device<T>{}.deallocate(device_side_buffer,
-                                                  number_of_elements);
+    allocator.deallocate(device_side_buffer, number_of_elements);
   }
   // not yet implemented
   hip_device_buffer(hip_device_buffer const &other) = delete;
@@ -138,30 +132,19 @@ struct hip_device_buffer {
   hip_device_buffer(hip_device_buffer const &&other) = delete;
   hip_device_buffer operator=(hip_device_buffer const &&other) = delete;
 
-private:
-  bool set_id{false};
 };
 
 template <typename T, typename Host_Allocator, std::enable_if_t<std::is_trivial<T>::value, int> = 0>
 struct hip_aggregated_device_buffer {
-  size_t gpu_id{0};
   T *device_side_buffer;
   size_t number_of_elements;
-  explicit hip_aggregated_device_buffer(size_t number_of_elements)
-      : number_of_elements(number_of_elements) {
-    device_side_buffer =
-        recycle_allocator_hip_device<T>{}.allocate(number_of_elements);
-  }
-  explicit hip_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc)
-      : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) {
-    assert(gpu_id == 0); 
+  hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc)
+      : number_of_elements(number_of_elements), alloc(alloc) {
     device_side_buffer =
         alloc.allocate(number_of_elements);
   }
   ~hip_aggregated_device_buffer() {
-    assert(gpu_id == 0); 
-    alloc.deallocate(device_side_buffer,
-                                                  number_of_elements);
+    alloc.deallocate(device_side_buffer, number_of_elements);
   }
   // not yet implemented
   hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete;
@@ -170,8 +153,8 @@ struct hip_aggregated_device_buffer {
   hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete;
 
 private:
-  bool set_id{false};
-  Host_Allocator &alloc;
+  Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence
+                         // for the entire lifetime of this buffer
 };
 
 } // end namespace recycler

From 07f3c4187b5057729848e0c370c4f72912601553 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 21 Aug 2023 11:37:22 -0500
Subject: [PATCH 28/42] Fix missing hip device selectors

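The hipSetDevice calls dropped from the allocators in the previous patch are
reintroduced through select_device_functor specializations, mirroring the CUDA
path. The explicit invocation below is only an illustration of what the
functor does; in practice the buffer manager is expected to invoke it before
allocating on a device:

    recycler::device_selection::select_device_functor<
        double, recycler::detail::hip_device_allocator<double>>{}(device_id);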
---
 include/hip_buffer_util.hpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp
index 7942792b..adc846d0 100644
--- a/include/hip_buffer_util.hpp
+++ b/include/hip_buffer_util.hpp
@@ -157,5 +157,16 @@ struct hip_aggregated_device_buffer {
                          // for the entire lifetime of this buffer
 };
 
+namespace device_selection {
+template <typename T>
+struct select_device_functor<T, detail::hip_pinned_allocator<T>> {
+  void operator()(const size_t device_id) { hipSetDevice(device_id); }
+};
+template <typename T>
+struct select_device_functor<T, detail::hip_device_allocator<T>> {
+  void operator()(const size_t device_id) { hipSetDevice(device_id); }
+};
+} // namespace device_selection
+
 } // end namespace recycler
 #endif

From 8b4d15b5b158ad26c3de2346c70687dac09db1bd Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 21 Aug 2023 11:37:37 -0500
Subject: [PATCH 29/42] Add exception warnings

This should not impact performance on the non-exception path, since gcc and
clang implement exception handling as a zero-cost abstraction.
---
 include/buffer_manager.hpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index ea04160b..32e751d9 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -93,16 +93,28 @@ For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCL
   static T *get(size_t number_elements, bool manage_content_lifetime = false,
       std::optional<size_t> location_hint = std::nullopt, 
       std::optional<size_t> device_id = std::nullopt) {
-    return buffer_manager<T, Host_Allocator>::get(
-        number_elements, manage_content_lifetime, location_hint, device_id);
+    try {
+      return buffer_manager<T, Host_Allocator>::get(
+          number_elements, manage_content_lifetime, location_hint, device_id);
+    } catch (const std::exception &exc) {
+      std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl;
+      std::cerr << "Rethrowing exception... " << std::endl;;
+      throw;
+    }
   }
   /// Marks an buffer as unused and fit for reusage
   template <typename T, typename Host_Allocator>
   static void mark_unused(T *p, size_t number_elements,
       std::optional<size_t> location_hint = std::nullopt, 
       std::optional<size_t> device_id = std::nullopt) {
-    return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements,
-        location_hint, device_id);
+    try {
+      return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements,
+          location_hint, device_id);
+    } catch (const std::exception &exc) {
+      std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl;
+      std::cerr << "Rethrowing exception... " << std::endl;;
+      throw;
+    }
   }
 #endif
   template <typename T, typename Host_Allocator>

From 9fa7e440cf1a2e1270e6af9f9333def6bc436ec9 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 21 Aug 2023 17:21:55 -0500
Subject: [PATCH 30/42] Fix hip device side buffer

---
 include/hip_buffer_util.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp
index adc846d0..e2364095 100644
--- a/include/hip_buffer_util.hpp
+++ b/include/hip_buffer_util.hpp
@@ -121,7 +121,7 @@ struct hip_device_buffer {
       : allocator{device_id}, number_of_elements(number_of_elements) {
     assert(device_id < max_number_gpus);
     device_side_buffer =
-        recycle_allocator_hip_device<T>{}.allocate(number_of_elements);
+        allocator.allocate(number_of_elements);
   }
   ~hip_device_buffer() {
     allocator.deallocate(device_side_buffer, number_of_elements);

From 4ba470aa5ac72148c31d1e2d4ecae32196d18b5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 22 Aug 2023 14:35:28 -0500
Subject: [PATCH 31/42] Move mutex types and config constants out of the global
 namespace

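mutex_t, aggregation_mutex_t, number_instances, max_number_gpus and
get_device_id now live in the recycler namespace, so downstream code has to
qualify them, e.g.:

    std::lock_guard<recycler::aggregation_mutex_t> guard(pool_mutex);
    const size_t gpu_id = recycler::get_device_id(number_devices);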
---
 include/aggregation_manager.hpp | 58 ++++++++++++++++-----------------
 include/detail/config.hpp       | 17 +++++++---
 include/stream_manager.hpp      | 36 ++++++++++----------
 3 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index 35338bfc..c297056c 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -144,7 +144,7 @@ template <typename Executor> class aggregated_function_call {
   std::any function_tuple;
   /// Stores the string of the first function call for debug output
   std::string debug_type_information;
-  aggregation_mutex_t debug_mut;
+  recycler::aggregation_mutex_t debug_mut;
 #endif
 
   std::vector<hpx::lcos::local::promise<void>> potential_async_promises{};
@@ -175,7 +175,7 @@ template <typename Executor> class aggregated_function_call {
 #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS)
     // needed for concurrent access to function_tuple and debug_type_information
     // Not required for normal use
-    std::lock_guard<aggregation_mutex_t> guard(debug_mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(debug_mut);
 #endif
     assert(!async_mode);
     assert(potential_async_promises.empty());
@@ -249,7 +249,7 @@ template <typename Executor> class aggregated_function_call {
 #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS)
     // needed for concurrent access to function_tuple and debug_type_information
     // Not required for normal use
-    std::lock_guard<aggregation_mutex_t> guard(debug_mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(debug_mut);
 #endif
     assert(async_mode);
     assert(!potential_async_promises.empty());
@@ -530,7 +530,7 @@ template <typename Executor> class Aggregated_Executor {
   /// slices have called it
   std::deque<aggregated_function_call<Executor>> function_calls;
   /// For synchronizing the access to the function calls list
-  aggregation_mutex_t mut;
+  recycler::aggregation_mutex_t mut;
 
   /// Data entry for a buffer allocation: void* pointer, size_t for
   /// buffer-size, atomic for the slice counter, location_id, gpu_id
@@ -541,7 +541,7 @@ template <typename Executor> class Aggregated_Executor {
   /// Map pointer to deque index for fast access in the deallocations
   std::unordered_map<void*,size_t> buffer_allocations_map;
   /// For synchronizing the access to the buffer_allocations
-  aggregation_mutex_t buffer_mut;
+  recycler::aggregation_mutex_t buffer_mut;
   std::atomic<size_t> buffer_counter = 0;
 
   /// Get new buffer OR get buffer already allocated by different slice
@@ -553,7 +553,7 @@ template <typename Executor> class Aggregated_Executor {
     // First: Check if it already has happened
     if (buffer_counter <= slice_alloc_counter) {
       // we might be the first! Lock...
-      std::lock_guard<aggregation_mutex_t> guard(buffer_mut);
+      std::lock_guard<recycler::aggregation_mutex_t> guard(buffer_mut);
       // ... and recheck
       if (buffer_counter <= slice_alloc_counter) {
         constexpr bool manage_content_lifetime = false;
@@ -563,7 +563,7 @@ template <typename Executor> class Aggregated_Executor {
         // many different buffers for different aggregation sizes on different GPUs
         /* size_t location_id = gpu_id * instances_per_gpu; */
         // Use integer conversion to only use 0 16 32 ... as buckets
-        size_t location_id = ((hpx::get_worker_thread_num() % number_instances) / 16) * 16; 
+        size_t location_id = ((hpx::get_worker_thread_num() % recycler::number_instances) / 16) * 16; 
 #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
         if (max_slices == 1) {
           // get prefered location: aka the current hpx threads location
@@ -648,7 +648,7 @@ template <typename Executor> class Aggregated_Executor {
     // Check if all slices are done with this buffer?
     if (buffer_allocation_counter == 0) {
       // Yes! "Deallocate" by telling the recylcer the buffer is fit for reusage
-      std::lock_guard<aggregation_mutex_t> guard(buffer_mut);
+      std::lock_guard<recycler::aggregation_mutex_t> guard(buffer_mut);
       // Only mark unused if another buffer has not done so already (and marked
       // it as invalid)
       if (valid) {
@@ -661,7 +661,7 @@ template <typename Executor> class Aggregated_Executor {
 
         const size_t current_deallocs = ++dealloc_counter;
         if (current_deallocs == buffer_counter) {
-          std::lock_guard<aggregation_mutex_t> guard(mut);
+          std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
           buffers_in_use = false;
           if (!executor_slices_alive && !buffers_in_use)
             slices_exhausted = false;
@@ -679,11 +679,11 @@ template <typename Executor> class Aggregated_Executor {
 
   /// Only meant to be accessed by the slice executors
   bool sync_aggregation_slices(const size_t slice_launch_counter) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, false, executor);
         overall_launch_counter = function_calls.size();
@@ -699,11 +699,11 @@ template <typename Executor> class Aggregated_Executor {
   /// Only meant to be accessed by the slice executors
   template <typename F, typename... Ts>
   void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, false, executor);
         overall_launch_counter = function_calls.size();
@@ -722,11 +722,11 @@ template <typename Executor> class Aggregated_Executor {
   template <typename F, typename... Ts>
   hpx::lcos::future<void> async(const size_t slice_launch_counter, F &&f,
                                 Ts &&...ts) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, true, executor);
         overall_launch_counter = function_calls.size();
@@ -742,11 +742,11 @@ template <typename Executor> class Aggregated_Executor {
   template <typename F, typename... Ts>
   hpx::lcos::shared_future<void> wrap_async(const size_t slice_launch_counter, F &&f,
                                 Ts &&...ts) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
     assert(slices_exhausted == true);
     // Add function call object in case it hasn't happened for this launch yet
     if (overall_launch_counter <= slice_launch_counter) {
-      /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+      /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */
       if (overall_launch_counter <= slice_launch_counter) {
         function_calls.emplace_back(current_slices, true, executor);
         overall_launch_counter = function_calls.size();
@@ -760,12 +760,12 @@ template <typename Executor> class Aggregated_Executor {
   }
 
   bool slice_available(void) {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
     return !slices_exhausted;
   }
 
   std::optional<hpx::lcos::future<Executor_Slice>> request_executor_slice() {
-    std::lock_guard<aggregation_mutex_t> guard(mut);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
     if (!slices_exhausted) {
       const size_t local_slice_id = ++current_slices;
       if (local_slice_id == 1) {
@@ -773,7 +773,7 @@ template <typename Executor> class Aggregated_Executor {
         // TODO still required? Should be clean here already
         function_calls.clear();
         overall_launch_counter = 0;
-        std::lock_guard<aggregation_mutex_t> guard(buffer_mut);
+        std::lock_guard<recycler::aggregation_mutex_t> guard(buffer_mut);
 #ifndef NDEBUG
         for (const auto &buffer_entry : buffer_allocations) {
           const auto &[buffer_pointer_any, buffer_size,
@@ -831,7 +831,7 @@ template <typename Executor> class Aggregated_Executor {
         }
         // Launch all executor slices within this continuation
         current_continuation = fut.then([this](auto &&fut) {
-          std::lock_guard<aggregation_mutex_t> guard(mut);
+          std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
           slices_exhausted = true;
           launched_slices = current_slices;
           size_t id = 0;
@@ -868,7 +868,7 @@ template <typename Executor> class Aggregated_Executor {
   }
   size_t launched_slices;
   void reduce_usage_counter(void) {
-    /* std::lock_guard<aggregation_mutex_t> guard(mut); */
+    /* std::lock_guard<recycler::aggregation_mutex_t> guard(mut); */
     assert(slices_exhausted == true);
     assert(executor_slices_alive == true);
     assert(launched_slices >= 1);
@@ -885,7 +885,7 @@ template <typename Executor> class Aggregated_Executor {
       // std::get<0>(executor_tuple); 
       // Mark executor fit for reusage
 
-      std::lock_guard<aggregation_mutex_t> guard(mut);
+      std::lock_guard<recycler::aggregation_mutex_t> guard(mut);
       executor_slices_alive = false; 
       if (!executor_slices_alive && !buffers_in_use) {
         slices_exhausted = false;
@@ -1021,7 +1021,7 @@ class aggregation_pool {
           std::string("Trying to initialize cppuddle aggregation pool twice") +
           " Agg pool name: " + std::string(kernelname));
     }
-    if (num_devices > max_number_gpus) {
+    if (num_devices > recycler::max_number_gpus) {
       throw std::runtime_error(
           std::string(
               "Trying to initialize aggregation with more devices than the "
@@ -1031,7 +1031,7 @@ class aggregation_pool {
     number_devices = num_devices;
     for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) {
 
-      std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
+      std::lock_guard<recycler::aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
       assert(instance()[gpu_id].aggregation_executor_pool.empty());
       for (int i = 0; i < number_of_executors; i++) {
         instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor,
@@ -1050,9 +1050,9 @@ class aggregation_pool {
           std::string("Trying to use cppuddle aggregation pool without first calling init") +
           " Agg poolname: " + std::string(kernelname));
     }
-    const size_t gpu_id = get_device_id(number_devices);
+    const size_t gpu_id = recycler::get_device_id(number_devices);
     /* const size_t gpu_id = 1; */
-    std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
+    std::lock_guard<recycler::aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
     assert(!instance()[gpu_id].aggregation_executor_pool.empty());
     std::optional<hpx::lcos::future<
         typename Aggregated_Executor<Interface>::Executor_Slice>>
@@ -1104,11 +1104,11 @@ class aggregation_pool {
 private:
   /// Required for dealing with adding elements to the deque of
   /// aggregated_executors
-  aggregation_mutex_t pool_mutex;
+  recycler::aggregation_mutex_t pool_mutex;
   /// Global access instance
   static std::unique_ptr<aggregation_pool[]>& instance(void) {
     static std::unique_ptr<aggregation_pool[]> pool_instances{
-        new aggregation_pool[max_number_gpus]};
+        new aggregation_pool[recycler::max_number_gpus]};
     return pool_instances;
   }
   static inline size_t number_devices = 1;
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index 9b425f8b..e118e884 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -6,16 +6,13 @@
 #ifndef CPPUDDLE_CONFIG_HPP
 #define CPPUDDLE_CONFIG_HPP
 
+
 // Mutex configuration
 //
 #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
 #include <hpx/mutex.hpp>
-using mutex_t = hpx::spinlock_no_backoff;
-using aggregation_mutex_t = hpx::mutex;
 #else
 #include <mutex>
-using mutex_t = std::mutex;
-using aggregation_mutex_t = std::mutex;
 #endif
 
 // HPX-aware configuration
@@ -31,6 +28,16 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR
 #endif
 #endif
 
+namespace recycler {
+
+#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
+using mutex_t = hpx::spinlock_no_backoff;
+using aggregation_mutex_t = hpx::mutex;
+#else
+using mutex_t = std::mutex;
+using aggregation_mutex_t = std::mutex;
+#endif
+
 // Recycling configuration
 // TODO Add warnings here
 
@@ -62,4 +69,6 @@ inline size_t get_device_id(const size_t number_gpus) {
 #endif
 }
 
+} // end namespace recycler
+
 #endif
diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index 8be13b32..4174af5c 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -153,7 +153,7 @@ class stream_pool {
   template <class Interface, class Pool>
   static size_t get_next_device_id(const size_t number_gpus) noexcept {
     // TODO add round robin and min strategy
-    return get_device_id(number_gpus);
+    return recycler::get_device_id(number_gpus);
   }
 
   template <class Interface, class Pool>
@@ -175,7 +175,7 @@ class stream_pool {
     /// Deprecated! Use init_on_all_gpu or init_on_gpu
     template <typename... Ts>
     static void init(size_t number_of_streams, Ts ... executor_args) {
-      /* static_assert(sizeof...(Ts) == sizeof...(Ts) && max_number_gpus == 1, */
+      /* static_assert(sizeof...(Ts) == sizeof...(Ts) && recycler::max_number_gpus == 1, */
       /*               "deprecated stream_pool::init does not support multigpu"); */
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       instance().streampools.emplace_back(number_of_streams, executor_args...);
@@ -186,7 +186,7 @@ class stream_pool {
     static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       if (number_of_streams > 0) {
-        for (size_t gpu_id = 0; gpu_id < max_number_gpus; gpu_id++) {
+        for (size_t gpu_id = 0; gpu_id < recycler::max_number_gpus; gpu_id++) {
           instance().select_gpu_function(gpu_id);
           instance().streampools.emplace_back(number_of_streams,
                                               executor_args...);
@@ -209,40 +209,40 @@ class stream_pool {
     // TODO add/rename into finalize?
     static void cleanup() {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
-      assert(instance().streampools.size() == max_number_gpus);
+      assert(instance().streampools.size() == recycler::max_number_gpus);
       instance().streampools.clear();
     }
 
     static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) {
-      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == max_number_gpus);
+      std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == recycler::max_number_gpus);
       return instance().streampools[gpu_id].get_interface();
     }
     static void release_interface(size_t index, const size_t gpu_id = 0) {
-      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == max_number_gpus);
+      std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == recycler::max_number_gpus);
       instance().streampools[gpu_id].release_interface(index);
     }
     static bool interface_available(size_t load_limit, const size_t gpu_id = 0) {
-      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == max_number_gpus);
+      std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == recycler::max_number_gpus);
       return instance().streampools[gpu_id].interface_available(load_limit);
     }
     static size_t get_current_load(const size_t gpu_id = 0) {
-      std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == max_number_gpus);
+      std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
+      assert(instance().streampools.size() == recycler::max_number_gpus);
       return instance().streampools[gpu_id].get_current_load();
     }
     // TODO deprecated! Remove...
     /* static size_t get_next_device_id(const size_t gpu_id = 0) { */
-    /*   std::lock_guard<mutex_t> guard(instance().gpu_mutexes[gpu_id]); */
-    /*   assert(instance().streampools.size() == max_number_gpus); */
+    /*   std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]); */
+    /*   assert(instance().streampools.size() == recycler::max_number_gpus); */
     /*   return instance().streampools[gpu_id].get_next_device_id(); */
     /* } */
 
     static void set_device_selector(std::function<void(size_t)> select_gpu_function) {
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
-      assert(instance().streampools.size() == max_number_gpus);
+      assert(instance().streampools.size() == recycler::max_number_gpus);
       instance().select_gpu_function = select_gpu_function;
     }
 
@@ -252,15 +252,15 @@ class stream_pool {
 
   private:
     stream_pool_implementation() = default;
-    mutex_t pool_mut{};
+    recycler::mutex_t pool_mut{};
     std::function<void(size_t)> select_gpu_function = [](size_t gpu_id) {
       // By default no multi gpu support
-      assert(max_number_gpus == 1);
+      assert(recycler::max_number_gpus == 1);
       assert(gpu_id == 0);
     };
 
     std::deque<Pool> streampools{};
-    std::array<mutex_t, max_number_gpus> gpu_mutexes;
+    std::array<recycler::mutex_t, recycler::max_number_gpus> gpu_mutexes;
 
     static stream_pool_implementation& instance(void) {
       static stream_pool_implementation pool_instance{};

From be375086b6a9e350234b5f55fea938648b1a9041 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 22 Aug 2023 15:33:23 -0500
Subject: [PATCH 32/42] Improve cmake variable names

---
 CMakeLists.txt            | 32 ++++++++++++++++----------------
 include/detail/config.hpp |  4 ++--
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb38b230..5ff430d4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,13 +23,13 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}
 option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF)
 option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF)
 option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF)
-set(CPPUDDLE_WITH_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.")
+set(CPPUDDLE_WITH_MAX_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.")
 # HPX-related options
 option(CPPUDDLE_WITH_HPX "Enable basic HPX integration and examples" OFF)
 option(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS "Enable HPX-aware allocators for even better HPX integration" ON)
 set(CPPUDDLE_WITH_HPX_MUTEX OFF CACHE BOOL
   "Use HPX spinlock mutex instead of std::mutex")
-set(CPPUDDLE_WITH_MAX_NUMBER_WORKERS "128" CACHE STRING "Max number of workers threads supported. Should match the intended number of HPX workers or be 1 in non-HPX builds.")
+set(CPPUDDLE_WITH_NUMBER_BUCKETS "128" CACHE STRING "Number of internal recycle buckets buffer type. Should ideally match the intended number of HPX workers or be 1 in non-HPX builds.")
 # Test-related options
 option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF)
 option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF)
@@ -69,10 +69,10 @@ if(CPPUDDLE_WITH_NUMBER_GPUS GREATER 1)
   endif()
 endif()
 
-if(CPPUDDLE_WITH_MAX_NUMBER_WORKERS GREATER 1)
+if(CPPUDDLE_WITH_NUMBER_BUCKETS GREATER 1)
   if(NOT CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
     message(FATAL_ERROR " CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON is required for Multi-Worker build! \
-    Either turn it on or configure with CPPUDDLE_WITH_MAX_NUMBER_WORKERS=1 !")
+    Either turn it on or configure with CPPUDDLE_WITH_NUMBER_BUCKETS=1 !")
   endif()
 endif()
 
@@ -164,15 +164,15 @@ if (CPPUDDLE_WITH_HPX)
   if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
     message(INFO " Compiling with HPX-aware allocators!")
     target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS")
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}")
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_MAX_NUMBER_GPUS}")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=${CPPUDDLE_WITH_NUMBER_BUCKETS}")
   else()
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
-    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1")
+    target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1")
   endif()
 else()
-  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
-  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
+  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1")
+  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1")
 endif()
 if (CPPUDDLE_WITH_COUNTERS)
   target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS")
@@ -187,15 +187,15 @@ if (CPPUDDLE_WITH_HPX)
   target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX")
   if(CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS)
     target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS")
-    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_NUMBER_GPUS}")
-    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=${CPPUDDLE_WITH_MAX_NUMBER_WORKERS}")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=${CPPUDDLE_WITH_MAX_NUMBER_GPUS}")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=${CPPUDDLE_WITH_NUMBER_BUCKETS}")
   else()
-    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
-    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1")
+    target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1")
   endif()
 else()
-  target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_GPUS=1")
-  target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_MAX_NUMBER_WORKERS=1")
+  target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_MAX_NUMBER_GPUS=1")
+  target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_NUMBER_BUCKETS=1")
 endif()
 if (CPPUDDLE_WITH_COUNTERS)
   target_compile_definitions(stream_manager INTERFACE "CPPUDDLE_HAVE_COUNTERS")
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index e118e884..b5816cf3 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -49,9 +49,9 @@ using aggregation_mutex_t = std::mutex;
 
 // Thread and MultiGPU configuration
 //
-constexpr size_t number_instances = CPPUDDLE_MAX_NUMBER_WORKERS;
+constexpr size_t number_instances = CPPUDDLE_HAVE_NUMBER_BUCKETS;
 static_assert(number_instances >= 1);
-constexpr size_t max_number_gpus = CPPUDDLE_MAX_NUMBER_GPUS;
+constexpr size_t max_number_gpus = CPPUDDLE_HAVE_MAX_NUMBER_GPUS;
 #ifndef CPPUDDLE_HAVE_HPX
 static_assert(max_number_gpus == 1, "Non HPX builds do not support multigpu");
 #endif

From 3310dee81e0314a885bad300ce87305e2f12f8ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 22 Aug 2023 15:45:07 -0500
Subject: [PATCH 33/42] Show CI valgrind error list

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ff430d4..1fbb9fda 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -415,12 +415,12 @@ if (CPPUDDLE_WITH_TESTS)
     find_program(VALGRIND_COMMAND valgrind)
     if (VALGRIND_COMMAND)
       add_test(allocator_memcheck.valgrind
-      ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_test --arraysize 5000000 --passes 200)
+      ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full --undef-value-errors=no --show-error-list=yes ./allocator_test --arraysize 5000000 --passes 200)
       set_tests_properties(allocator_memcheck.valgrind PROPERTIES
         PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts"
       )
       add_test(allocator_aligned_memcheck.valgrind
-      ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_aligned_test --arraysize 5000000 --passes 200)
+      ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full --undef-value-errors=no --show-error-list=yes ./allocator_aligned_test --arraysize 5000000 --passes 200)
       set_tests_properties(allocator_aligned_memcheck.valgrind PROPERTIES
         PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts"
       )

From 73fc898edcd18e25eef43caf945cbee841e3f633 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Tue, 22 Aug 2023 16:15:10 -0500
Subject: [PATCH 34/42] Fix version string

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1fbb9fda..cbe132be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,8 +12,8 @@ set(CMAKE_CXX_STANDARD 17)
 # Version
 
 set(CPPUDDLE_VERSION_MAJOR 0)
-set(CPPUDDLE_VERSION_MINOR 1)
-set(CPPUDDLE_VERSION_PATCH 99)
+set(CPPUDDLE_VERSION_MINOR 3)
+set(CPPUDDLE_VERSION_PATCH 0)
 set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR}.${CPPUDDLE_VERSION_PATCH}.")
 
 #------------------------------------------------------------------------------------------------------------

From 69126c5b53eb38fd90b60c7affe80a73ba199406 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 00:59:36 -0500
Subject: [PATCH 35/42] Fix stream test

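The stream pool interface no longer defaults the gpu_id, so the tests pass it
explicitly. The updated calling convention for the single-GPU case looks
roughly like this:

    auto iface_tuple = stream_pool::get_interface<Interface, Pool>(0);
    const size_t iface_idx = std::get<1>(iface_tuple);
    stream_pool::release_interface<Interface, Pool>(iface_idx, 0);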
---
 tests/stream_test.hpp | 64 +++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp
index 716a8ba7..07de4c44 100644
--- a/tests/stream_test.hpp
+++ b/tests/stream_test.hpp
@@ -20,7 +20,7 @@ void test_pool_memcpy(const size_t stream_parameter, Ts &&... ts) {
   stream_pool::init<Interface, Pool>(stream_parameter, std::forward<Ts>(ts)...);
   // without interface wrapper
   {
-    auto test1 = stream_pool::get_interface<Interface, Pool>();
+    auto test1 = stream_pool::get_interface<Interface, Pool>(0);
     Interface test1_interface = std::get<0>(test1);
     size_t interface_id = std::get<1>(test1);
     hpx::apply(test1_interface, cudaMemcpyAsync, devicebuffer.device_side_buffer,
@@ -30,12 +30,12 @@ void test_pool_memcpy(const size_t stream_parameter, Ts &&... ts) {
         cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer,
         512 * sizeof(double), cudaMemcpyDeviceToHost);
     fut1.get();
-    stream_pool::release_interface<Interface, Pool>(interface_id);
+    stream_pool::release_interface<Interface, Pool>(interface_id, 0);
   }
 
   // with interface wrapper
   {
-    stream_interface<Interface, Pool> test1_interface;
+    stream_interface<Interface, Pool> test1_interface{0};
     // hpx::cuda::cuda_executor test1_interface(0, false);
     hpx::apply(test1_interface.interface, cudaMemcpyAsync, devicebuffer.device_side_buffer,
                          hostbuffer.data(), 512 * sizeof(double),
@@ -55,43 +55,43 @@ void test_pool_ref_counting(const size_t stream_parameter, Ts &&... ts) {
   stream_pool::init<Interface, Pool>(stream_parameter, std::forward<Ts>(ts)...);
   {
     // Allocating
-    auto test1 = stream_pool::get_interface<Interface, Pool>();
-    auto load1 = stream_pool::get_current_load<Interface, Pool>();
+    auto test1 = stream_pool::get_interface<Interface, Pool>(0);
+    auto load1 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load1 == 0);
     Interface test1_interface = std::get<0>(test1);
     size_t test1_index = std::get<1>(test1);
-    auto test2 = stream_pool::get_interface<Interface, Pool>();
-    auto load2 = stream_pool::get_current_load<Interface, Pool>();
+    auto test2 = stream_pool::get_interface<Interface, Pool>(0);
+    auto load2 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load2 == 1);
     Interface test2_interface = std::get<0>(test2);
     // auto fut = test2_interface.get_future();
     size_t test2_index = std::get<1>(test2);
-    auto test3 = stream_pool::get_interface<Interface, Pool>();
-    auto load3 = stream_pool::get_current_load<Interface, Pool>();
+    auto test3 = stream_pool::get_interface<Interface, Pool>(0);
+    auto load3 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load3 == 1);
     Interface test3_interface = std::get<0>(test3);
     size_t test3_index = std::get<1>(test3);
-    auto test4 = stream_pool::get_interface<Interface, Pool>();
-    auto load4 = stream_pool::get_current_load<Interface, Pool>();
+    auto test4 = stream_pool::get_interface<Interface, Pool>(0);
+    auto load4 = stream_pool::get_current_load<Interface, Pool>(0);
     Interface test4_interface = std::get<0>(test4);
     size_t test4_index = std::get<1>(test4);
     assert(load4 == 2);
     // Releasing
-    stream_pool::release_interface<Interface, Pool>(test4_index);
-    load4 = stream_pool::get_current_load<Interface, Pool>();
+    stream_pool::release_interface<Interface, Pool>(test4_index, 0);
+    load4 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load4 == 1);
-    stream_pool::release_interface<Interface, Pool>(test3_index);
-    load3 = stream_pool::get_current_load<Interface, Pool>();
+    stream_pool::release_interface<Interface, Pool>(test3_index, 0);
+    load3 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load3 == 1);
-    stream_pool::release_interface<Interface, Pool>(test2_index);
-    load2 = stream_pool::get_current_load<Interface, Pool>();
+    stream_pool::release_interface<Interface, Pool>(test2_index, 0);
+    load2 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load2 == 0);
-    stream_pool::release_interface<Interface, Pool>(test1_index);
-    load1 = stream_pool::get_current_load<Interface, Pool>();
+    stream_pool::release_interface<Interface, Pool>(test1_index, 0);
+    load1 = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load1 == 0);
   }
   // Clear
-  auto load0 = stream_pool::get_current_load<Interface, Pool>();
+  auto load0 = stream_pool::get_current_load<Interface, Pool>(0);
   assert(load0 == 0);
   stream_pool::cleanup<Interface, Pool>();
 }
@@ -102,28 +102,28 @@ void test_pool_wrappers(const size_t stream_parameter, Ts &&... ts) {
   // init ppol
   stream_pool::init<Interface, Pool>(stream_parameter, std::forward<Ts>(ts)...);
   {
-    wrapper_type test1;
-    auto load = stream_pool::get_current_load<Interface, Pool>();
+    wrapper_type test1{0};
+    auto load = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load == 0);
-    wrapper_type test2;
-    load = stream_pool::get_current_load<Interface, Pool>();
+    wrapper_type test2{0};
+    load = stream_pool::get_current_load<Interface, Pool>(0);
     // auto fut = test2.get_future();
     assert(load == 1);
-    wrapper_type test3;
-    load = stream_pool::get_current_load<Interface, Pool>();
+    wrapper_type test3{0};
+    load = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load == 1);
-    wrapper_type test4;
-    load = stream_pool::get_current_load<Interface, Pool>();
+    wrapper_type test4{0};
+    load = stream_pool::get_current_load<Interface, Pool>(0);
     assert(load == 2);
     // Check availability method:
-    bool avail = stream_pool::interface_available<Interface, Pool>(1);
+    bool avail = stream_pool::interface_available<Interface, Pool>(1, 0);
     assert(avail == false); // NOLINT
-    avail = stream_pool::interface_available<Interface, Pool>(2);
+    avail = stream_pool::interface_available<Interface, Pool>(2, 0);
     assert(avail == false); // NOLINT
-    avail = stream_pool::interface_available<Interface, Pool>(3);
+    avail = stream_pool::interface_available<Interface, Pool>(3, 0);
     assert(avail == true); // NOLINT
   }
-  auto load0 = stream_pool::get_current_load<Interface, Pool>();
+  auto load0 = stream_pool::get_current_load<Interface, Pool>(0);
   assert(load0 == 0);
   stream_pool::cleanup<Interface, Pool>();
 }
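
    [Editor's illustration, not part of the patch series] A minimal sketch of the gpu_id-aware pool
    calls the updated test exercises; Interface and Pool stand for whatever executor/pool pair the
    pool was initialized with, and the single-GPU test always passes device id 0:

        stream_pool::init<Interface, Pool>(2 /* streams, plus executor args */);
        auto slot = stream_pool::get_interface<Interface, Pool>(0);   // acquire a slot on device 0
        Interface &exec = std::get<0>(slot);
        size_t idx = std::get<1>(slot);
        // ... launch work on exec ...
        stream_pool::release_interface<Interface, Pool>(idx, 0);      // hand the slot back on device 0
        stream_pool::cleanup<Interface, Pool>();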

From de53e04fef277cd1c57aa358f0f4571c930c2caa Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 00:59:52 -0500
Subject: [PATCH 36/42] Fix Kokkos test

---
 tests/allocator_kokkos_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp
index 055fa397..2222cac3 100644
--- a/tests/allocator_kokkos_test.cpp
+++ b/tests/allocator_kokkos_test.cpp
@@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
   for (size_t pass = 0; pass < passes; pass++) {
     test_view my_wrapper_test1(1000);
     test_view my_wrapper_test2(1000);
-    test_view my_wrapper_test3(number_instances - 1, 1000); // test 1D with location id 
+    test_view my_wrapper_test3(recycler::number_instances - 1, 1000); // test 1D with location id 
     double t = 2.6;
     Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000),
                         KOKKOS_LAMBDA(const int n) {

From ced646f56f42c5dbf08a440bf3522b5bca4c0d89 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 01:08:30 -0500
Subject: [PATCH 37/42] Fix aggregation test

---
 include/aggregation_manager.hpp       | 4 ++--
 include/detail/config.hpp             | 2 +-
 tests/work_aggregation_cuda_triad.cpp | 7 +++----
 tests/work_aggregation_test.cpp       | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp
index c297056c..92ad5f8d 100644
--- a/include/aggregation_manager.hpp
+++ b/include/aggregation_manager.hpp
@@ -777,7 +777,7 @@ template <typename Executor> class Aggregated_Executor {
 #ifndef NDEBUG
         for (const auto &buffer_entry : buffer_allocations) {
           const auto &[buffer_pointer_any, buffer_size,
-                       buffer_allocation_counter, valid, location_id] =
+                       buffer_allocation_counter, valid, location_id, device_id] =
               buffer_entry;
           assert(!valid);
         }
@@ -908,7 +908,7 @@ template <typename Executor> class Aggregated_Executor {
 #ifndef NDEBUG
     for (const auto &buffer_entry : buffer_allocations) {
       const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter,
-                   valid, location_id] = buffer_entry;
+                   valid, location_id, device_id] = buffer_entry;
       assert(!valid);
     }
 #endif
diff --git a/include/detail/config.hpp b/include/detail/config.hpp
index b5816cf3..1764ffff 100644
--- a/include/detail/config.hpp
+++ b/include/detail/config.hpp
@@ -62,7 +62,7 @@ static_assert(max_number_gpus > 0);
 /// Uses HPX thread information to determine which GPU should be used
 inline size_t get_device_id(const size_t number_gpus) {
 #if defined(CPPUDDLE_HAVE_HPX) 
-    assert(number_gpus < max_number_gpus);
+    assert(number_gpus <= max_number_gpus);
     return hpx::get_worker_thread_num() % number_gpus; 
 #else
     return 0;
diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp
index 2ded567a..f3f6ec92 100644
--- a/tests/work_aggregation_cuda_triad.cpp
+++ b/tests/work_aggregation_cuda_triad.cpp
@@ -28,7 +28,6 @@ __global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_
 //===============================================================================
 //===============================================================================
 int hpx_main(int argc, char *argv[]) {
-  static_assert(max_number_gpus == 1, "This test currently does not support MultiGPU builds!");
   // Init parameters
   size_t problem_size{0};
   size_t kernel_size{0};
@@ -209,7 +208,7 @@ int hpx_main(int argc, char *argv[]) {
 
                   recycler::cuda_aggregated_device_buffer<float_t,
                                                           decltype(alloc_device)>
-                      device_A(slice_exec.number_slices * kernel_size, 0,
+                      device_A(slice_exec.number_slices * kernel_size, 
                                alloc_device);
 
                   std::vector<float_t, decltype(alloc_host)> local_B(
@@ -217,7 +216,7 @@ int hpx_main(int argc, char *argv[]) {
                       alloc_host);
                   recycler::cuda_aggregated_device_buffer<float_t,
                                                           decltype(alloc_device)>
-                      device_B(slice_exec.number_slices * kernel_size, 0,
+                      device_B(slice_exec.number_slices * kernel_size, 
                                alloc_device);
                   
                   std::vector<float_t, decltype(alloc_host)> local_C(
@@ -225,7 +224,7 @@ int hpx_main(int argc, char *argv[]) {
                       alloc_host);
                   recycler::cuda_aggregated_device_buffer<float_t,
                                                           decltype(alloc_device)>
-                      device_C(slice_exec.number_slices * kernel_size, 0,
+                      device_C(slice_exec.number_slices * kernel_size,
                                alloc_device);
 
                   for (size_t i = task_id * kernel_size, j = 0;
diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp
index 28a37806..25455633 100644
--- a/tests/work_aggregation_test.cpp
+++ b/tests/work_aggregation_test.cpp
@@ -605,7 +605,7 @@ void references_add_test(void) {
     auto &agg_exec =
         std::get<0>(stream_pool::get_interface<
                     Aggregated_Executor<Dummy_Executor>,
-                    round_robin_pool<Aggregated_Executor<Dummy_Executor>>>());
+                    round_robin_pool<Aggregated_Executor<Dummy_Executor>>>(0));
     std::vector<float> erg(512);
     std::vector<hpx::lcos::future<void>> slices_done_futs;
 
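    [Editor's illustration, not part of the patch series] A hedged sketch of how the adjusted
    get_device_id helper is meant to be consumed (assuming it sits in the recycler namespace next to
    the other config constants); with HPX enabled it maps the current worker thread onto one of
    number_gpus devices:

        const size_t gpu_id = recycler::get_device_id(recycler::max_number_gpus);
        cudaSetDevice(static_cast<int>(gpu_id));  // select the device before touching its buffers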

From 10864aa2583540b0c2ce2c2d8cdb1cdf0302e174 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 01:12:58 -0500
Subject: [PATCH 38/42] Fix Kokkos regex

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ff430d4..0dd00398 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -561,12 +561,12 @@ if (CPPUDDLE_WITH_TESTS)
         add_test(allocator_kokkos_test.analyse_cleaned_buffers cat allocator_kokkos_test.out)
         set_tests_properties(allocator_kokkos_test.analyse_cleaned_buffers PROPERTIES
           FIXTURES_REQUIRED allocator_kokkos_output
-          PASS_REGULAR_EXPRESSION "--> Number cleaned up buffers:[ ]* 2"
+          PASS_REGULAR_EXPRESSION "--> Number cleaned up buffers:[ ]* 3"
         )
         add_test(allocator_kokkos_test.analyse_created_buffers cat allocator_kokkos_test.out)
         set_tests_properties(allocator_kokkos_test.analyse_created_buffers PROPERTIES
           FIXTURES_REQUIRED allocator_kokkos_output
-          PASS_REGULAR_EXPRESSION "--> Number of times a new buffer had to be created for a request:[ ]* 2"
+          PASS_REGULAR_EXPRESSION "--> Number of times a new buffer had to be created for a request:[ ]* 3"
         )
         add_test(allocator_kokkos_test.analyse_bad_allocs cat allocator_kokkos_test.out)
         set_tests_properties(allocator_kokkos_test.analyse_bad_allocs PROPERTIES

From b012fd73f9f428aecab5b42d8624b1af3912ff24 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 12:31:13 -0500
Subject: [PATCH 39/42] Rename deactivate cmake flags

---
 CMakeLists.txt             | 18 ++++++++++--------
 include/buffer_manager.hpp |  4 ++--
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e5198c0..1981b66e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,8 +34,8 @@ set(CPPUDDLE_WITH_NUMBER_BUCKETS "128" CACHE STRING "Number of internal recycle
 option(CPPUDDLE_WITH_COUNTERS "Turns on allocations counters. Useful for extended testing" OFF)
 option(CPPUDDLE_WITH_TESTS "Build tests/examples" OFF)
 set(CPPUDDLE_WITH_DEADLOCK_TEST_REPETITONS "100000" CACHE STRING "Number of repetitions for the aggregation executor deadlock tests")
-option(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING "Deactivates the default recycling behaviour" OFF)
-option(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS "Deactivates the aggressive allocators" OFF)
+option(CPPUDDLE_WITH_BUFFER_RECYCLING "Enables the default recycling behaviour! Turning this off will have a major negative performance impact and is only intended for testing!" ON)
+option(CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING "Allows the aggressive allocators variants to reuse contents from previous buffers (and thus skip initializations)" ON)
 # Tooling options
 option(CPPUDDLE_WITH_CLANG_TIDY "Enable clang tidy warnings" OFF)
 option(CPPUDDLE_WITH_CLANG_FORMAT "Enable clang format target" OFF)
@@ -213,16 +213,18 @@ else()
   message(INFO " Compiling with std::mutex!")
 endif()
 
-if(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING)
-  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING")
-  message(WARNING " Slow Build: Buffer recycling is deactivated. This should only be used for performance tests!")
-else()
+if(CPPUDDLE_WITH_BUFFER_RECYCLING)
   message(INFO " Using default buffer recycling behaviour!")
+else()
+  message(WARNING " Slow Build: Buffer recycling is deactivated. This should only be used for performance tests!")
+  target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING")
 endif()
 
-if(CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS)
+if(CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING)
+  message(INFO " Using default behaviour for aggressive content reusage (only relevant for aggressive allocators)!")
+else()
   target_compile_definitions(buffer_manager INTERFACE "CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS")
-  message(WARNING " Slow Build: Aggressive allocators disabled. This should only be used for performance tests!")
+  message(WARNING " Slow Build: Aggressive allocators (and thus content recycling) is disabled. This should only be used for performance tests!")
 endif()
 
 # install libs with the defitions:
diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp
index 32e751d9..92a5f46b 100644
--- a/include/buffer_manager.hpp
+++ b/include/buffer_manager.hpp
@@ -70,7 +70,7 @@ class buffer_recycler {
 // Warn about suboptimal performance without recycling
 #pragma message                                                                \
 "Warning: Building without buffer recycling! Use only for performance testing! \
-For better performance configure CPPuddle with CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING=OFF!"
+For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!"
 
   template <typename T, typename Host_Allocator>
   static T *get(size_t number_elements, bool manage_content_lifetime = false,
@@ -887,7 +887,7 @@ struct aggressive_recycle_allocator {
 // Warn about suboptimal performance without recycling
 #pragma message                                                                \
 "Warning: Building without content reusage for aggressive allocators! \
-For better performance configure with CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS=OFF !"
+For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !"
   template <typename... Args>
   inline void construct(T *p, Args... args) noexcept {
     ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...);
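
    [Editor's illustration, not part of the patch series] The renamed options only invert the sense of
    the old deactivate flags; the compile definitions the headers key on are unchanged, so existing
    translation units keep building. A small sketch of the guard that is still used:

        // CPPUDDLE_WITH_BUFFER_RECYCLING=OFF still defines CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING:
        #ifdef CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING
          // recycling disabled: every request falls through to a fresh allocation (testing only)
        #else
          // default: returned buffers are cached and reused on the next matching request
        #endif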

From 65c8c9cd67b665b902175fe2bdf0f8178a0a4465 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 15:54:55 -0500
Subject: [PATCH 40/42] Fix asserts and single-GPU tests for multi-GPU builds

---
 include/stream_manager.hpp      | 13 ++++++++-----
 tests/allocator_kokkos_test.cpp |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp
index 4174af5c..40631491 100644
--- a/include/stream_manager.hpp
+++ b/include/stream_manager.hpp
@@ -179,6 +179,7 @@ class stream_pool {
       /*               "deprecated stream_pool::init does not support multigpu"); */
       auto guard = make_scoped_lock_from_array(instance().gpu_mutexes);
       instance().streampools.emplace_back(number_of_streams, executor_args...);
+      assert(instance().streampools.size() <= recycler::max_number_gpus);
     }
 
     /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments
@@ -192,6 +193,7 @@ class stream_pool {
                                               executor_args...);
         }
       }
+      assert(instance().streampools.size() <= recycler::max_number_gpus);
     }
 
     /// Per-GPU init allowing for different init parameters depending on the GPU 
@@ -204,6 +206,7 @@ class stream_pool {
         instance().streampools.emplace_back(number_of_streams, 
                                             executor_args...);
       }
+      assert(instance().streampools.size() <= recycler::max_number_gpus);
     }
 
     // TODO add/rename into finalize?
@@ -215,22 +218,22 @@ class stream_pool {
 
     static std::tuple<Interface &, size_t> get_interface(const size_t gpu_id = 0) {
       std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == recycler::max_number_gpus);
+      assert(gpu_id < instance().streampools.size());
       return instance().streampools[gpu_id].get_interface();
     }
     static void release_interface(size_t index, const size_t gpu_id = 0) {
       std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == recycler::max_number_gpus);
+      assert(gpu_id < instance().streampools.size());
       instance().streampools[gpu_id].release_interface(index);
     }
     static bool interface_available(size_t load_limit, const size_t gpu_id = 0) {
       std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == recycler::max_number_gpus);
+      assert(gpu_id < instance().streampools.size());
       return instance().streampools[gpu_id].interface_available(load_limit);
     }
     static size_t get_current_load(const size_t gpu_id = 0) {
       std::lock_guard<recycler::mutex_t> guard(instance().gpu_mutexes[gpu_id]);
-      assert(instance().streampools.size() == recycler::max_number_gpus);
+      assert(gpu_id < instance().streampools.size());
       return instance().streampools[gpu_id].get_current_load();
     }
     // TODO deprecated! Remove...
@@ -255,7 +258,7 @@ class stream_pool {
     recycler::mutex_t pool_mut{};
     std::function<void(size_t)> select_gpu_function = [](size_t gpu_id) {
       // By default no multi gpu support
-      assert(recycler::max_number_gpus == 1);
+      assert(recycler::max_number_gpus == 1 || instance().streampools.size() == 1);
       assert(gpu_id == 0);
     };
 
diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp
index 2222cac3..e2770458 100644
--- a/tests/allocator_kokkos_test.cpp
+++ b/tests/allocator_kokkos_test.cpp
@@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
   for (size_t pass = 0; pass < passes; pass++) {
     test_view my_wrapper_test1(1000);
     test_view my_wrapper_test2(1000);
-    test_view my_wrapper_test3(recycler::number_instances - 1, 1000); // test 1D with location id 
+    test_view my_wrapper_test3(0, 1000); // test 1D with explicit device id parameter
     double t = 2.6;
     Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, 1000),
                         KOKKOS_LAMBDA(const int n) {
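
    [Editor's illustration, not part of the patch series] A minimal sketch of what the relaxed asserts
    permit: a build configured for several GPUs may initialize stream pools for a single device only,
    in which case gpu_id merely has to stay below the number of initialized pools rather than equal
    recycler::max_number_gpus (the init arguments are placeholders):

        stream_pool::init<Interface, Pool>(8 /* streams for device 0 only */);
        auto slot = stream_pool::get_interface<Interface, Pool>(0);   // fine: pool 0 exists
        stream_pool::release_interface<Interface, Pool>(std::get<1>(slot), 0);
        // get_interface<Interface, Pool>(1) would now trip the per-call gpu_id assert instead.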

From 6bb7fd01819b43a001420528dd78062eaa5decb2 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 23 Aug 2023 23:53:43 -0500
Subject: [PATCH 41/42] Check if recycling is on before running tests

---
 CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1981b66e..64690fc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -245,6 +245,9 @@ install(EXPORT CPPuddle NAMESPACE CPPuddle:: DESTINATION ${CMAKE_INSTALL_PREFIX}
 
 ## Add target for tests and tests definitions
 if (CPPUDDLE_WITH_TESTS)
+  if(NOT CPPUDDLE_WITH_BUFFER_RECYCLING)
+    message(FATAL_ERROR "The CPPuddle tests only work with CPPUDDLE_WITH_BUFFER_RECYCLING=ON. Turning off buffer recycling is not recommended in general!")
+  endif()
   add_executable(allocator_test tests/allocator_test.cpp)
   if (CPPUDDLE_WITH_HPX)
   target_link_libraries(allocator_test

From 83f99bbf1e8a6be586c3041035d304a6bb1eeffa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gregor=20Dai=C3=9F?= <Gregor.Daiss+git@gmail.com>
Date: Thu, 24 Aug 2023 10:58:47 -0500
Subject: [PATCH 42/42] Fix performance test

---
 CMakeLists.txt                   | 21 +++------------------
 tests/allocator_aligned_test.cpp |  4 ++--
 tests/allocator_hpx_test.cpp     |  4 ++--
 tests/allocator_test.cpp         |  4 ++--
 4 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1981b66e..35953162 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -395,15 +395,10 @@ if (CPPUDDLE_WITH_TESTS)
     )
   endif()
   if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on
-    add_test(allocator_test.performance.analyse_recycle_performance cat allocator_test.out)
-    set_tests_properties(allocator_test.performance.analyse_recycle_performance PROPERTIES
-      FIXTURES_REQUIRED allocator_test_output
-      PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!"
-    )
     add_test(allocator_test.performance.analyse_aggressive_performance cat allocator_test.out)
     set_tests_properties(allocator_test.performance.analyse_aggressive_performance PROPERTIES
       FIXTURES_REQUIRED allocator_test_output
-      PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!"
+      PASS_REGULAR_EXPRESSION "Test information: Aggressive recycler was faster than default allocator!"
     )
   endif()
   add_test(allocator_test.fixture_cleanup ${CMAKE_COMMAND} -E remove allocator_test.out)
@@ -462,15 +457,10 @@ if (CPPUDDLE_WITH_TESTS)
     )
   endif()
   if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on
-    add_test(allocator_aligned_test.performance.analyse_recycle_performance cat allocator_aligned_test.out)
-    set_tests_properties(allocator_aligned_test.performance.analyse_recycle_performance PROPERTIES
-      FIXTURES_REQUIRED allocator_aligned_test_output
-      PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!"
-    )
     add_test(allocator_aligned_test.performance.analyse_aggressive_performance cat allocator_aligned_test.out)
     set_tests_properties(allocator_aligned_test.performance.analyse_aggressive_performance PROPERTIES
       FIXTURES_REQUIRED allocator_aligned_test_output
-      PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!"
+      PASS_REGULAR_EXPRESSION "Test information: Aggressive recycler was faster than default allocator!"
     )
   endif()
   add_test(allocator_aligned_test.fixture_cleanup ${CMAKE_COMMAND} -E remove allocator_aligned_test.out)
@@ -518,15 +508,10 @@ if (CPPUDDLE_WITH_TESTS)
       )
     endif()
     if (NOT CMAKE_BUILD_TYPE MATCHES "Debug") # Performance tests only make sense with optimizations on
-      add_test(allocator_concurrency_test.performance.analyse_recycle_performance cat allocator_concurrency_test.out)
-      set_tests_properties(allocator_concurrency_test.performance.analyse_recycle_performance PROPERTIES
-        FIXTURES_REQUIRED allocator_concurrency_output
-        PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!"
-      )
       add_test(allocator_concurrency_test.performance.analyse_aggressive_performance cat allocator_concurrency_test.out)
       set_tests_properties(allocator_concurrency_test.performance.analyse_aggressive_performance PROPERTIES
         FIXTURES_REQUIRED allocator_concurrency_output
-        PASS_REGULAR_EXPRESSION "Test information: Recycler was faster than default allocator!"
+        PASS_REGULAR_EXPRESSION "Test information: Aggressive recycler was faster than default allocator!"
       )
     endif()
     add_test(allocator_concurrency_test.fixture_cleanup ${CMAKE_COMMAND} -E remove allocator_concurrency_test.out)
diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp
index 9178dfbf..c3c09217 100644
--- a/tests/allocator_aligned_test.cpp
+++ b/tests/allocator_aligned_test.cpp
@@ -142,8 +142,8 @@ int main(int argc, char *argv[]) {
                  "recycler!"
               << std::endl;
   }
-  if (recycle_duration < default_duration) {
-    std::cout << "Test information: Recycler was faster than default allocator!"
+  if (aggressive_duration < default_duration) {
+    std::cout << "Test information: Aggressive recycler was faster than default allocator!"
               << std::endl;
   }
   recycler::print_performance_counters();
diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp
index 6ddb70ad..9d8cc44b 100644
--- a/tests/allocator_hpx_test.cpp
+++ b/tests/allocator_hpx_test.cpp
@@ -177,8 +177,8 @@ int hpx_main(int argc, char *argv[]) {
                    "recycler!"
                 << std::endl;
     }
-    if (recycle_duration < default_duration) {
-      std::cout << "Test information: Recycler was faster than default allocator!"
+    if (aggressive_duration < default_duration) {
+      std::cout << "Test information: Aggressive recycler was faster than default allocator!"
                 << std::endl;
     }
   }
diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp
index 0d8868db..004368a4 100644
--- a/tests/allocator_test.cpp
+++ b/tests/allocator_test.cpp
@@ -134,8 +134,8 @@ int main(int argc, char *argv[]) {
                  "recycler!"
               << std::endl;
   }
-  if (recycle_duration < default_duration) {
-    std::cout << "Test information: Recycler was faster than default allocator!"
+  if (aggressive_duration < default_duration) {
+    std::cout << "Test information: Aggressive recycler was faster than default allocator!"
               << std::endl;
   }
   recycler::print_performance_counters();
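
    [Editor's illustration, not part of the patch series] A sketch of the measurement pattern the
    adjusted performance checks assume; the run_* helpers are hypothetical stand-ins for the tests'
    allocation loops, and the CTest regex above matches the aggressive-recycler message only:

        #include <chrono>
        #include <iostream>

        const auto t0 = std::chrono::steady_clock::now();
        run_with_default_allocator();      // hypothetical: N passes using std::allocator
        const auto t1 = std::chrono::steady_clock::now();
        run_with_aggressive_recycler();    // hypothetical: N passes using the aggressive recycle allocator
        const auto t2 = std::chrono::steady_clock::now();
        const auto default_duration = t1 - t0;
        const auto aggressive_duration = t2 - t1;
        if (aggressive_duration < default_duration) {
          std::cout << "Test information: Aggressive recycler was faster than default allocator!"
                    << std::endl;
        }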