optimize bf allocator #869

Open · wants to merge 4 commits into base: main
Changes from 1 commit
Incrementally optimize the BF allocator. You can use DIPU_DEVICE_MEMCACHING_ALGORITHM=BF2 to use the optimized allocator without affecting the original BF implementation.
zhaoguochun1995 committed Jul 23, 2024
commit eb75ef995661b4349b8f1601208b49d8483bd092
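For reference, the BF2 path is opt-in: as the commit message notes, it is selected through the DIPU_DEVICE_MEMCACHING_ALGORITHM environment variable (for example, DIPU_DEVICE_MEMCACHING_ALGORITHM=BF2 python your_script.py), leaving the original BF allocator untouched. Below is a minimal sketch of that kind of env-driven dispatch; the helper name and the "BF" fallback are assumptions for illustration, not the actual DIPU registration code.

#include <cstdlib>
#include <string>

// Hypothetical sketch only: the real selection logic lives in DIPU's
// allocator registration, which this diff does not show.
std::string selectedCachingAlgorithm() {
  const char* env = std::getenv("DIPU_DEVICE_MEMCACHING_ALGORITHM");
  // Values exercised by test_allocator.py: "TORCH", "BF", "BF2", "BS", "RAW".
  return env != nullptr ? std::string(env) : std::string("BF");  // fallback assumed
}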
1 change: 1 addition & 0 deletions dipu/tests/python/individual_scripts/test_allocator.py
@@ -82,6 +82,7 @@ def test_allocator(
(
{"args": (MAX_ALLOCATE, 1, "TORCH", 0)},
{"args": (MAX_ALLOCATE, 1, "BF", 0)},
{"args": (MAX_ALLOCATE, 1, "BF2", 0)},
{"args": (MAX_ALLOCATE, 1, "BS", 0)},
{"args": (MAX_ALLOCATE, 1, "RAW", 0)},
{"args": (MAX_ALLOCATE, 17919, "BF", 3, False)},
1 change: 1 addition & 0 deletions dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -116,6 +116,7 @@ set(TORCH_DIPU_SOURCE
runtime/core/allocator/DIPURawAllocator.cpp
runtime/core/allocator/DIPUCachingAllocator.cpp
runtime/core/allocator/DIPUBFCachingAllocator.cpp
runtime/core/allocator/DIPUBFCachingAllocator2.cpp
runtime/core/allocator/DIPUBSCachingAllocator.cpp
runtime/core/allocator/DIPUCachingHostAllocator.cpp
runtime/core/allocator/DIPUCachingDeviceAllocator.cpp
dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator2.cpp
@@ -1,21 +1,22 @@
// Copyright (c) 2023, DeepLink.

#include <cstddef>
#include <functional>
#include <memory>
#include <stack>
#include <thread>
#include <utility>
#include <vector>

#include "csrc_dipu/utils/env.hpp"

#include "DIPUCachingAllocator.h"
#include "DIPUSpinMutex.h"

namespace dipu {

inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
return ((nbytes - 1) | (alignment_size - 1)) + 1;
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const size_t kMaxExtendSize = get_env_or_default("DIPU_MAX_EXTEND_SIZE", 1024)
<< 20U;

class BFCachingAllocatorImpl {
public:
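A quick standalone check of the helper and constant added above: round_up_to_alignment assumes a power-of-two alignment_size and rounds nbytes up to the next multiple of it, and kMaxExtendSize defaults to 1024 << 20 bytes (1 GiB), with DIPU_MAX_EXTEND_SIZE read in MiB.

#include <cassert>
#include <cstddef>

// Same bit trick as round_up_to_alignment in the hunk above: for a
// power-of-two alignment, (nbytes - 1) | (alignment - 1) sets all low bits,
// so adding 1 lands on the next multiple of the alignment.
inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
  return ((nbytes - 1) | (alignment_size - 1)) + 1;
}

int main() {
  assert(round_up_to_alignment(1, 512) == 512);
  assert(round_up_to_alignment(512, 512) == 512);     // already aligned
  assert(round_up_to_alignment(513, 512) == 1024);    // next multiple
  assert(round_up_to_alignment(1024, 4096) == 4096);
  return 0;
}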
@@ -30,23 +31,10 @@ class BFCachingAllocatorImpl {
// Number of second level bins (linearly)
static constexpr int kNumSubBins = 4;
static constexpr int kLogNumSubBins = 2;

// Allocation parameters
static constexpr size_t kMinBlockSize =
512; // all sizes are rounded to at least 512 bytes
static constexpr size_t kSmallSize =
1048576; // largest "small" allocation is 1 MiB
static constexpr size_t kSmallBuffer =
2097152; // "small" allocations are packed in 2 MiB blocks
static constexpr size_t kLargeBuffer =
20971520; // "large" allocations may be packed in 20 MiB blocks
static constexpr size_t kMinLargeAlloc =
10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
static constexpr size_t kRoundLarge =
2097152; // round up large allocations to 2 MiB
static constexpr size_t kMaxSplitableBlockSize =
200 << 20; // To further reduce fragmentation, blocks >= 200MB are not
// allowed to be split
static constexpr size_t kMinAllocationSize = 512;
static constexpr size_t kMaxInternalFragmentation = 8U << 20U; // 8MB
static constexpr size_t kMinExtendSize = 8U << 20U; // 8MB

size_t cachedBytes = 0;
size_t allocatedBytes = 0;
@@ -79,6 +67,8 @@ class BFCachingAllocatorImpl {
__uint128_t bits = 0;
// Virtual chunks which are the heads of the bins
std::array<int, static_cast<size_t>(kNumBigBins* kNumSubBins)> binHeads_{};
// The extending size next time
size_t currExtendSize_ = kMinExtendSize;

explicit StreamSet(size_t id) : id(id) {}

@@ -150,11 +140,7 @@ class BFCachingAllocatorImpl {
mutable mutex_t mut_;

static size_t roundBytes(size_t nbytes) {
if (nbytes <= kMinBlockSize) {
return kMinBlockSize;
}
int clz = __builtin_clzll(nbytes - 1);
return (1LU << (sizeof(int64_t) * 8 - clz));
return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1;
}

int newChunk(void* ptr, size_t size, size_t stream) {
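What the new roundBytes buys: the removed code rounded every request up to the next power of two (with a 512-byte floor), so a request just past a power of two nearly doubled its footprint; the new code rounds to the next multiple of kMinAllocationSize (512 bytes), capping rounding waste at 511 bytes. A comparison sketch, with the power-of-two helper re-implementing the removed behaviour for illustration:

#include <cassert>
#include <cstddef>

// Removed behaviour, re-implemented for comparison: round up to the next
// power of two, never below 512 bytes.
static size_t roundPow2(size_t nbytes) {
  size_t r = 512;
  while (r < nbytes) {
    r <<= 1;
  }
  return r;
}

// New behaviour: round up to the next multiple of kMinAllocationSize (512).
static size_t round512(size_t nbytes) { return ((nbytes - 1) | 511U) + 1; }

int main() {
  size_t n = (5U << 20U) + 1;                // 5 MiB + 1 byte
  assert(roundPow2(n) == (8U << 20U));       // old: 8 MiB, ~3 MiB of padding
  assert(round512(n) == (5U << 20U) + 512);  // new: one extra 512-byte unit
  return 0;
}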
@@ -177,7 +163,7 @@ class BFCachingAllocatorImpl {
// Big bin range:
// [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx`
// Split big bin into `kNumSubBins` sub bins
size_t nBlocks = nbytes / kMinBlockSize;
size_t nBlocks = nbytes / kMinAllocationSize;
constexpr int kMaxBinIdx = 63;
int bigBinIdx = kMaxBinIdx - __builtin_clzll(nBlocks);
// If `nbytes` is so large, we just put it into the last
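In other words, the big-bin index is floor(log2(nbytes / kMinAllocationSize)): 63 - __builtin_clzll(n) is the position of the highest set bit of a 64-bit n. A small sketch using the constants above (sizes are assumed to be already rounded to at least 512 bytes, as roundBytes guarantees):

#include <cassert>
#include <cstddef>

// Big-bin lookup as in the hunk above: express the size in 512-byte blocks,
// then take the index of the highest set bit, i.e. floor(log2(nBlocks)).
static int bigBinIndex(size_t nbytes) {
  size_t nBlocks = nbytes / 512;  // kMinAllocationSize blocks; nBlocks >= 1
  return 63 - __builtin_clzll(nBlocks);
}

int main() {
  assert(bigBinIndex(512) == 0);         // 1 block     -> big bin 0
  assert(bigBinIndex(1024) == 1);        // 2 blocks    -> big bin 1
  assert(bigBinIndex(1U << 20U) == 11);  // 2048 blocks -> big bin 11
  return 0;
}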
@@ -253,22 +239,16 @@ class BFCachingAllocatorImpl {
return id;
}

void shrink(StreamSetHandle& set, size_t try_release_size = 0) {
size_t released_size = 0;
void shrink(StreamSetHandle& set) {
for (int binHead : set->binHeads_) {
int k = chunks_[binHead].nextChunkInList;
while (k) {
auto& chunk_k = chunks_[k];
if (chunk_k.isMonoBlock()) {
released_size += chunk_k.size;
releaseOnDevice(chunk_k.ptr, chunk_k.size);
if (chunks_[k].isMonoBlock()) {
releaseOnDevice(chunks_[k].ptr, chunks_[k].size);
removeChunkFromBin(k);
recycleIds_.push(k);
if (try_release_size > 0 && released_size >= try_release_size) {
break;
}
}
k = chunk_k.nextChunkInList;
k = chunks_[k].nextChunkInList;
}
}
}
@@ -311,39 +291,32 @@ class BFCachingAllocatorImpl {
return id;
}

size_t getAllocateSize(size_t nbytes) {
if (nbytes <= kSmallSize) {
return kSmallBuffer;
}
if (nbytes < kMinLargeAlloc) {
return kLargeBuffer;
}
return round_up_to_alignment(nbytes, kRoundLarge);
}

int extend(size_t nbytes, StreamSetHandle& set) {
size_t allocateSize = getAllocateSize(nbytes);

void* ptr = allocateOnDevice(allocateSize);
if (!ptr) {
shrink(set, allocateSize);
ptr = allocateOnDevice(allocateSize);
}
if (!ptr) {
shrink(set);
ptr = allocateOnDevice(allocateSize);
}
if (!ptr) {
if (allocateSize > nbytes) {
allocateSize = nbytes;
ptr = allocateOnDevice(allocateSize);
emptyCacheWithoutLock();
auto& extSize = set->currExtendSize_;
bool increased = false;
while (extSize < nbytes && extSize < kMaxExtendSize) {
extSize *= 2;
increased = true;
}

size_t currBytes = std::max(nbytes, extSize);
void* ptr = allocateOnDevice(currBytes);
if (ptr) {
if (!increased && extSize < kMaxExtendSize) {
extSize *= 2;
}
} else {
if (currBytes > nbytes) {
currBytes = nbytes;
ptr = allocateOnDevice(currBytes);
}
}
if (!ptr) {
return 0;
}

int id = newChunk(ptr, allocateSize, set->id);
int id = newChunk(ptr, currBytes, set->id);
return id;
}

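The rewritten extend() drops the fixed small/large buffer sizes in favour of a per-stream geometric growth policy: currExtendSize_ starts at kMinExtendSize (8 MiB), doubles until it covers the request or hits kMaxExtendSize, and doubles once more after a successful allocation that needed no growth, so a stream's successive extensions reserve 8, 16, 32, ... MiB. A standalone sketch of just the sizing arithmetic, assuming the device allocation succeeds (the fallback to exactly nbytes on failure is omitted):

#include <algorithm>
#include <cstddef>
#include <cstdio>

constexpr size_t kMinExtendSize = 8U << 20U;      // 8 MiB
constexpr size_t kMaxExtendSize = 1024UL << 20U;  // default DIPU_MAX_EXTEND_SIZE, 1 GiB

// Sizing logic of extend() above with allocateOnDevice() stubbed out as
// always succeeding; extSize plays the role of the stream's currExtendSize_.
static size_t nextReservation(size_t nbytes, size_t& extSize) {
  bool increased = false;
  while (extSize < nbytes && extSize < kMaxExtendSize) {
    extSize *= 2;
    increased = true;
  }
  size_t currBytes = std::max(nbytes, extSize);
  if (!increased && extSize < kMaxExtendSize) {
    extSize *= 2;  // grow the next extension
  }
  return currBytes;
}

int main() {
  size_t extSize = kMinExtendSize;
  for (int i = 0; i < 4; ++i) {
    // Four 1 MiB requests reserve 8, 16, 32 and 64 MiB chunks in turn.
    std::printf("%zu MiB\n", nextReservation(1U << 20U, extSize) >> 20U);
  }
  return 0;
}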
@@ -398,7 +371,8 @@ class BFCachingAllocatorImpl {
}

if (id) {
if (chunks_[id].size >= (nbytes << 1)) {
if (chunks_[id].size >= nbytes * 2 ||
chunks_[id].size >= nbytes + kMaxInternalFragmentation) {
id = split(id, nbytes);
}
chunks_[id].allocated = true;
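The second clause of the split test is the new part: previously a chunk was split only when it was at least twice the request, so a near-fit large chunk could carry megabytes of hidden padding. Now any leftover of at least kMaxInternalFragmentation (8 MiB) is split off and returned to the bins. A sketch of the decision:

#include <cassert>
#include <cstddef>

constexpr size_t kMaxInternalFragmentation = 8U << 20U;  // 8 MiB

// Split rule from the hunk above: split when the leftover is large either
// relatively (chunk >= 2x the request) or absolutely (leftover >= 8 MiB).
static bool shouldSplit(size_t chunkSize, size_t nbytes) {
  return chunkSize >= nbytes * 2 ||
         chunkSize >= nbytes + kMaxInternalFragmentation;
}

int main() {
  size_t req = 512UL << 20U;    // 512 MiB request
  size_t chunk = 520UL << 20U;  // best-fit candidate is 520 MiB
  assert(shouldSplit(chunk, req));  // new rule: give 8 MiB back to the bins
  assert(!(chunk >= req * 2));      // the old rule alone would not split
  return 0;
}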
@@ -532,9 +506,6 @@ class BFCachingAllocator : public CacheAllocator {
: DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes) {}

~Context() {
if (size() <= 0) {
return;
}
auto allocator_ = static_cast<const BFCachingAllocator*>(allocator());
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: add to async_mem_pool:"
<< ptr() << ", " << size() << " nbytes, id:"
@@ -544,22 +515,18 @@
if (ptr()) {
allocator_->metrics_producer.deallocate(ptr());
std::deque<DIPUEvent> events;
bool record_block = false;
for (auto const& stream : streams()) {
events.emplace_back();
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:"
<< stream.rawstream());
events.back().record(stream);
record_block = true;
}
allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_),
events);
allocator_->set_memory_allocated(allocator_->memory_allocated() -
nbytes_);
if (!record_block) {
allocator_->restore();
}
}
allocator_->restore();
} else {
DIPU_DEBUG_ALLOCATOR(8,
"BFCachingAllocator:~Context: destory tensor "
@@ -570,12 +537,12 @@

friend class Context;

c10::DataPtr allocate(size_t origin_size) const override {
c10::DataPtr allocate(size_t size) const override {
restore();
if (async_mem_pool()->size() > kMaxAsyncResourcePoolLength) {
try_empty_resource_pool();
}
size_t size = getMemoryAlignmentStrategy()->roundBytes(origin_size);
size = getMemoryAlignmentStrategy()->roundBytes(size);
std::tuple<void*, int, size_t> block = impl->allocateRaw(size);
void* ptr = std::get<0>(block);
if (ptr == nullptr && size > 0) {
@@ -601,8 +568,8 @@
deleteBFContext, device());
DIPU_DEBUG_ALLOCATOR(
4, "BFCachingAllocator: malloc "
<< nbytes << ",requires " << origin_size
<< " nbytes, ptr:" << ptr << ",device:" << device()
<< nbytes << ",requires " << size << " nbytes, ptr:" << ptr
<< ",device:" << device()
<< ",async_mempool.size:" << async_mem_pool()->size());
c10::reportMemoryUsageToProfiler(
ptr, static_cast<int64_t>(nbytes), memory_allocated(),