From b1ac4459b30f0c2063ea85996b2f0375f1d9df33 Mon Sep 17 00:00:00 2001
From: Mike Wilson <hyperbolic2346@users.noreply.github.com>
Date: Mon, 2 Nov 2020 18:39:19 -0500
Subject: [PATCH] Tracking resource adaptor to catch memory leaks (#596)

`tracking_resource_adaptor` tracks memory allocations and can return or log remaining allocations to track leaks. Fixes #467.

Co-authored-by: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Co-authored-by: Mark Harris <mharris@nvidia.com>
---
 CHANGELOG.md                                  |   3 +-
 CMakeLists.txt                                |   3 +-
 include/rmm/detail/stack_trace.hpp            |  85 ++++++
 .../mr/device/tracking_resource_adaptor.hpp   | 261 ++++++++++++++++++
 tests/CMakeLists.txt                          |   9 +-
 tests/mr/device/tracking_mr_tests.cpp         |  93 +++++++
 6 files changed, 450 insertions(+), 4 deletions(-)
 create mode 100644 include/rmm/detail/stack_trace.hpp
 create mode 100644 include/rmm/mr/device/tracking_resource_adaptor.hpp
 create mode 100644 tests/mr/device/tracking_mr_tests.cpp
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 48bd99b09..4281e5148 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,8 @@
 
 ## New Features
 
- - PR #608 Add stream wrapper type
+- PR #596 Add `tracking_resource_adaptor` to help catch memory leaks
+- PR #608 Add stream wrapper type
 
 ## Improvements
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16af5b494..97d03d803 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,8 +92,7 @@ else()
   target_link_libraries(rmm INTERFACE CUDA::cudart)
 endif(CUDA_STATIC_RUNTIME)
 
-target_link_libraries(rmm INTERFACE rmm::Thrust)
-target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only)
+target_link_libraries(rmm INTERFACE rmm::Thrust spdlog::spdlog_header_only ${CMAKE_DL_LIBS})
 
 ###################################################################################################
 # Set logging level. Must go before including gtests and benchmarks. 
diff --git a/include/rmm/detail/stack_trace.hpp b/include/rmm/detail/stack_trace.hpp
new file mode 100644
index 000000000..91a74401f
--- /dev/null
+++ b/include/rmm/detail/stack_trace.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// execinfo is a linux-only library, so stack traces will only be available on
+// linux systems.
+#if (defined(__GNUC__) && !defined(__MINGW32__) && !defined(__MINGW64__))
+#define RMM_ENABLE_STACK_TRACES
+#endif
+
+#include <sstream>
+
+#if defined(RMM_ENABLE_STACK_TRACES)
+#include <execinfo.h>
+#include <memory>
+#include <vector>
+#endif
+
+namespace rmm {
+
+namespace detail {
+
+/**
+ * @brief stack_trace is a class that will capture a stack on instantiation for output later.
+ * It can then be used in an output stream to display stack information. At most 64 frames
+ * are captured; when stack traces are disabled, streaming prints "stack traces disabled".
+ *
+ *   rmm::detail::stack_trace saved_stack;
+ *   std::cout << "callstack: " << saved_stack;
+ *
+ */
+class stack_trace {
+ public:
+  stack_trace()
+  {
+#if defined(RMM_ENABLE_STACK_TRACES)
+    const int MaxStackDepth = 64;
+    void* stack[MaxStackDepth];
+    auto const depth = backtrace(stack, MaxStackDepth);
+    stack_ptrs.assign(stack, stack + depth);  // keep only the frames actually captured
+#endif  // RMM_ENABLE_STACK_TRACES
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const stack_trace& st)
+  {
+#if defined(RMM_ENABLE_STACK_TRACES)
+    std::unique_ptr<char*, decltype(&::free)> strings(
+      backtrace_symbols(st.stack_ptrs.data(), static_cast<int>(st.stack_ptrs.size())), &::free);
+    if (strings.get() == nullptr) {
+      os << "But no stack trace could be found!" << std::endl;
+    } else {
+      ///@todo: support for demangling of C++ symbol names
+      for (std::size_t i = 0; i < st.stack_ptrs.size(); ++i) {
+        os << "#" << i << " in " << strings.get()[i] << std::endl;
+      }
+    }
+#else
+    os << "stack traces disabled" << std::endl;
+#endif  // RMM_ENABLE_STACK_TRACES
+    return os;
+  }
+
+#if defined(RMM_ENABLE_STACK_TRACES)
+ private:
+  std::vector<void*> stack_ptrs;
+#endif  // RMM_ENABLE_STACK_TRACES
+};
+
+}  // namespace detail
+
+}  // namespace rmm
diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp
new file mode 100644
index 000000000..aa6687fb7
--- /dev/null
+++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <map>
+#include <mutex>
+#include <rmm/detail/error.hpp>
+#include <rmm/detail/stack_trace.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <shared_mutex>
+#include <sstream>
+
+namespace rmm {
+namespace mr {
+/**
+ * @brief Resource that uses `Upstream` to allocate memory and tracks allocations.
+ *
+ * An instance of this resource can be constructed with an existing, upstream
+ * resource in order to satisfy allocation requests, but any existing allocations
+ * will be untracked. Tracking stores a size and pointer for every allocation, and a stack
+ * frame if `capture_stacks` is true, so it can add significant overhead.
+ * `tracking_resource_adaptor` is intended as a debug adaptor and shouldn't be used in
+ * performance-sensitive code. Note that callstacks may not contain all symbols unless
+ * the project is linked with `-rdynamic`. This can be accomplished with
+ * `add_link_options(-rdynamic)` in cmake.
+ *
+ * @tparam Upstream Type of the upstream resource used for
+ * allocation/deallocation.
+ */
+template <typename Upstream>
+class tracking_resource_adaptor final : public device_memory_resource {
+ public:
+  // can be a std::shared_mutex once C++17 is adopted
+  using read_lock_t  = std::shared_lock<std::shared_timed_mutex>;
+  using write_lock_t = std::unique_lock<std::shared_timed_mutex>;
+
+  /**
+   * @brief Information stored about an allocation. Includes the size
+   * and a stack trace if the `tracking_resource_adaptor` was initialized
+   * to capture stacks.
+   *
+   */
+  struct allocation_info {
+    std::unique_ptr<rmm::detail::stack_trace> strace;
+    std::size_t allocation_size;
+
+    allocation_info() = delete;
+    allocation_info(std::size_t size, bool capture_stack)
+      : strace{[&]() {
+          return capture_stack ? std::make_unique<rmm::detail::stack_trace>() : nullptr;
+        }()},
+        allocation_size{size} {};
+  };
+
+  /**
+   * @brief Construct a new tracking resource adaptor using `upstream` to satisfy
+   * allocation requests.
+   *
+   * @throws `rmm::logic_error` if `upstream == nullptr`
+   *
+   * @param upstream The resource used for allocating/deallocating device memory
+   * @param capture_stacks If true, capture stacks for allocation calls
+   */
+  tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false)
+    : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream}
+  {
+    RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
+  }
+
+  tracking_resource_adaptor()                                 = delete;
+  ~tracking_resource_adaptor()                                = default;
+  tracking_resource_adaptor(tracking_resource_adaptor const&) = delete;
+  tracking_resource_adaptor(tracking_resource_adaptor&&)      = default;
+  tracking_resource_adaptor& operator=(tracking_resource_adaptor const&) = delete;
+  tracking_resource_adaptor& operator=(tracking_resource_adaptor&&) = default;
+
+  /**
+   * @brief Return pointer to the upstream resource.
+   *
+   * @return Upstream* Pointer to the upstream resource.
+   */
+  Upstream* get_upstream() const noexcept { return upstream_; }
+
+  /**
+   * @brief Checks whether the upstream resource supports streams.
+   *
+   * @return true The upstream resource supports streams
+   * @return false The upstream resource does not support streams.
+   */
+  bool supports_streams() const noexcept override { return upstream_->supports_streams(); }
+
+  /**
+   * @brief Query whether the resource supports the get_mem_info API.
+   *
+   * @return bool true if the upstream resource supports get_mem_info, false otherwise.
+   */
+  bool supports_get_mem_info() const noexcept override
+  {
+    return upstream_->supports_get_mem_info();
+  }
+
+  /**
+   * @brief Get the outstanding allocations map
+   * No lock is held on the returned reference; do not read it during concurrent (de)allocation.
+   * @return std::map<void*, allocation_info> const& of a map of allocations. The key
+   * is the allocated memory pointer and the data is the allocation_info structure, which
+   * contains size and, potentially, stack traces.
+   */
+  std::map<void*, allocation_info> const& get_outstanding_allocations() const noexcept
+  {
+    return allocations_;
+  }
+
+  /**
+   * @brief Query the number of bytes that have been allocated. Note that
+   * this can not be used to know how large of an allocation is possible due
+   * to both possible fragmentation and also internal page sizes and alignment
+   * that is not tracked by this allocator.
+   *
+   * @return std::size_t number of bytes that have been allocated through this
+   * allocator.
+   */
+  std::size_t get_allocated_bytes() const noexcept { return allocated_bytes_; }
+
+  /**
+   * @brief Log any outstanding allocations via RMM_LOG_DEBUG
+   * Compiles to a no-op unless SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG.
+   */
+  void log_outstanding_allocations() const
+  {
+#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG
+    read_lock_t lock(mtx_);
+    if (not allocations_.empty()) {
+      std::ostringstream oss;
+      for (auto const& al : allocations_) {
+        oss << al.first << ": " << al.second.allocation_size << " B";
+        if (al.second.strace != nullptr) {
+          oss << " : callstack:" << std::endl << *al.second.strace;
+        }
+        oss << std::endl;
+      }
+      RMM_LOG_DEBUG("Outstanding Allocations: {}", oss.str());
+    }
+#endif  // SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG
+  }
+
+ private:
+  /**
+   * @brief Allocates memory of size at least `bytes` using the upstream
+   * resource and records the pointer, size and (optionally) call stack.
+   *
+   * The returned pointer has at least 256B alignment.
+   *
+   * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled
+   * by the upstream resource.
+   *
+   * @param bytes The size, in bytes, of the allocation
+   * @param stream Stream on which to perform the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
+  {
+    void* p = upstream_->allocate(bytes, stream);
+
+    // track it.
+    {
+      write_lock_t lock(mtx_);
+      allocations_.emplace(p, allocation_info{bytes, capture_stacks_});
+    }
+    allocated_bytes_ += bytes;
+
+    return p;
+  }
+
+  /**
+   * @brief Free allocation of size `bytes` pointed to by `p`
+   *
+   * @throws Nothing.
+   *
+   * @param p Pointer to be deallocated
+   * @param bytes Size of the allocation
+   * @param stream Stream on which to perform the deallocation
+   */
+  void do_deallocate(void* p, std::size_t bytes, cuda_stream_view stream) override
+  {
+    upstream_->deallocate(p, bytes, stream);
+    {
+      write_lock_t lock(mtx_);
+      allocations_.erase(p);
+    }
+    allocated_bytes_ -= bytes;
+  }
+
+  /**
+   * @brief Compare the upstream resource to another.
+   *
+   * @throws Nothing.
+   *
+   * @param other The other resource to compare to
+   * @return true If the two resources are equivalent
+   * @return false If the two resources are not equal
+   */
+  bool do_is_equal(device_memory_resource const& other) const noexcept override
+  {
+    if (this == &other)
+      return true;
+    else {
+      auto cast = dynamic_cast<tracking_resource_adaptor<Upstream> const*>(&other);
+      return cast != nullptr ? upstream_->is_equal(*cast->get_upstream())
+                             : upstream_->is_equal(other);
+    }
+  }
+
+  /**
+   * @brief Get free and available memory from upstream resource.
+   *
+   * @throws `rmm::cuda_error` if unable to retrieve memory info.
+   *
+   * @param stream Stream on which to get the mem info.
+   * @return std::pair containing free_size and total_size of memory
+   */
+  std::pair<std::size_t, std::size_t> do_get_mem_info(cuda_stream_view stream) const override
+  {
+    return upstream_->get_mem_info(stream);
+  }
+
+  bool capture_stacks_;                           // whether or not to capture call stacks
+  std::map<void*, allocation_info> allocations_;  // map of active allocations
+  std::atomic<std::size_t> allocated_bytes_;      // number of bytes currently allocated
+  std::shared_timed_mutex mutable mtx_;           // mutex for thread safe access to allocations_
+  Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
+};
+
+/**
+ * @brief Convenience factory returning a `tracking_resource_adaptor` that wraps `upstream`.
+ * NOTE(review): the adaptor holds a mutex and an atomic, so it is not movable; this
+ * by-value return relies on C++17 guaranteed copy elision -- confirm the language standard.
+ * @tparam Upstream Type of the upstream `device_memory_resource`.
+ * @param upstream Pointer to the upstream resource
+ */
+template <typename Upstream>
+tracking_resource_adaptor<Upstream> make_tracking_adaptor(Upstream* upstream)
+{
+  return tracking_resource_adaptor<Upstream>{upstream};
+}
+
+}  // namespace mr
+}  // namespace rmm
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e787aa017..1c62a130e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -82,6 +82,14 @@ set(THRUST_ALLOCATOR_TEST_SRC
     
 ConfigureTest(THRUST_ALLOCATOR_TEST "${THRUST_ALLOCATOR_TEST_SRC}")
 
+###################################################################################################
+# - tracking adaptor tests
+
+set(TRACKING_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/tracking_mr_tests.cpp")
+
+ConfigureTest(TRACKING_TEST "${TRACKING_TEST_SRC}")
+
 ###################################################################################################
 # - limiting adaptor tests
 
@@ -89,7 +97,6 @@ set(LIMITING_TEST_SRC
     "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/limiting_mr_tests.cpp")
 
 ConfigureTest(LIMITING_TEST "${LIMITING_TEST_SRC}")
-target_compile_definitions(LIMITING_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)
 
 ###################################################################################################
 # - host mr tests
diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp
new file mode 100644
index 000000000..81e7e64f8
--- /dev/null
+++ b/tests/mr/device/tracking_mr_tests.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rmm/detail/error.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/tracking_resource_adaptor.hpp>
+#include "mr_test.hpp"
+
+#include <gtest/gtest.h>
+
+namespace rmm {
+namespace test {
+namespace {
+
+using tracking_adaptor = rmm::mr::tracking_resource_adaptor<rmm::mr::device_memory_resource>;
+
+TEST(TrackingTest, ThrowOnNullUpstream)
+{
+  auto const build_with_null_upstream = [] { tracking_adaptor adaptor{nullptr}; };
+  EXPECT_THROW(build_with_null_upstream(), rmm::logic_error);
+}
+
+TEST(TrackingTest, Empty)
+{
+  tracking_adaptor adaptor{rmm::mr::get_current_device_resource()};
+  EXPECT_EQ(adaptor.get_allocated_bytes(), 0);
+  EXPECT_EQ(adaptor.get_outstanding_allocations().size(), 0);
+}
+
+TEST(TrackingTest, AllFreed)
+{
+  tracking_adaptor adaptor{rmm::mr::get_current_device_resource()};
+  std::vector<void *> buffers(10, nullptr);
+  for (auto &buffer : buffers) {
+    buffer = adaptor.allocate(10_MiB);
+  }
+  for (auto *buffer : buffers) {
+    adaptor.deallocate(buffer, 10_MiB);
+  }
+  EXPECT_EQ(adaptor.get_allocated_bytes(), 0);
+  EXPECT_EQ(adaptor.get_outstanding_allocations().size(), 0);
+}
+
+TEST(TrackingTest, AllocationsLeftWithStacks)
+{
+  tracking_adaptor mr{rmm::mr::get_current_device_resource(), true};
+  std::vector<void *> allocations;
+  for (int i = 0; i < 10; ++i) {
+    allocations.push_back(mr.allocate(10_MiB));
+  }
+  for (int i = 0; i < 10; i += 2) {
+    mr.deallocate(allocations[i], 10_MiB);
+  }
+  EXPECT_EQ(mr.get_allocated_bytes(), 50_MiB);
+  auto const &outstanding_allocations = mr.get_outstanding_allocations();
+  EXPECT_EQ(outstanding_allocations.size(), 5);
+  EXPECT_NE(outstanding_allocations.begin()->second.strace, nullptr);
+  for (int i = 1; i < 10; i += 2) { mr.deallocate(allocations[i], 10_MiB); }  // free the rest
+}
+
+TEST(TrackingTest, AllocationsLeftWithoutStacks)
+{
+  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  std::vector<void *> allocations;
+  for (int i = 0; i < 10; ++i) {
+    allocations.push_back(mr.allocate(10_MiB));
+  }
+  for (int i = 0; i < 10; i += 2) {
+    mr.deallocate(allocations[i], 10_MiB);
+  }
+  EXPECT_EQ(mr.get_allocated_bytes(), 50_MiB);
+  auto const &outstanding_allocations = mr.get_outstanding_allocations();
+  EXPECT_EQ(outstanding_allocations.size(), 5);
+  EXPECT_EQ(outstanding_allocations.begin()->second.strace, nullptr);
+  for (int i = 1; i < 10; i += 2) { mr.deallocate(allocations[i], 10_MiB); }  // free the rest
+}
+
+}  // namespace
+}  // namespace test
+}  // namespace rmm