Skip to content

Commit

Permalink
Incrementally optimize the BF allocator. You can use DIPU_DEVICE_MEMCACHING_ALGORITHM=BF2 to use the optimized allocator without affecting the original BF implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
zhaoguochun1995 committed Jul 23, 2024
1 parent a248c16 commit eb75ef9
Show file tree
Hide file tree
Showing 4 changed files with 694 additions and 75 deletions.
1 change: 1 addition & 0 deletions dipu/tests/python/individual_scripts/test_allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def test_allocator(
(
{"args": (MAX_ALLOCATE, 1, "TORCH", 0)},
{"args": (MAX_ALLOCATE, 1, "BF", 0)},
{"args": (MAX_ALLOCATE, 1, "BF2", 0)},
{"args": (MAX_ALLOCATE, 1, "BS", 0)},
{"args": (MAX_ALLOCATE, 1, "RAW", 0)},
{"args": (MAX_ALLOCATE, 17919, "BF", 3, False)},
Expand Down
1 change: 1 addition & 0 deletions dipu/torch_dipu/csrc_dipu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ set(TORCH_DIPU_SOURCE
runtime/core/allocator/DIPURawAllocator.cpp
runtime/core/allocator/DIPUCachingAllocator.cpp
runtime/core/allocator/DIPUBFCachingAllocator.cpp
runtime/core/allocator/DIPUBFCachingAllocator2.cpp
runtime/core/allocator/DIPUBSCachingAllocator.cpp
runtime/core/allocator/DIPUCachingHostAllocator.cpp
runtime/core/allocator/DIPUCachingDeviceAllocator.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
// Copyright (c) 2023, DeepLink.

#include <cstddef>
#include <functional>
#include <memory>
#include <stack>
#include <thread>
#include <utility>
#include <vector>

#include "csrc_dipu/utils/env.hpp"

#include "DIPUCachingAllocator.h"
#include "DIPUSpinMutex.h"

namespace dipu {

// Rounds `nbytes` up to the nearest multiple of `alignment_size`.
// Precondition: `alignment_size` is a power of two (the bit trick below
// relies on `alignment_size - 1` being an all-ones low-bit mask).
// Note: `nbytes == 0` wraps to 0 rather than `alignment_size`.
inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
  const size_t mask = alignment_size - 1;
  return ((nbytes - 1) | mask) + 1;
}
// Upper bound, in bytes, on a single cache-extension allocation.
// Read once at startup from env var DIPU_MAX_EXTEND_SIZE (value is in MiB,
// default 1024); the `<< 20U` converts MiB to bytes.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const size_t kMaxExtendSize = get_env_or_default("DIPU_MAX_EXTEND_SIZE", 1024)
                              << 20U;

class BFCachingAllocatorImpl {
public:
Expand All @@ -30,23 +31,10 @@ class BFCachingAllocatorImpl {
// Number of second level bins (linearly)
static constexpr int kNumSubBins = 4;
static constexpr int kLogNumSubBins = 2;

// Allocation parameters
static constexpr size_t kMinBlockSize =
512; // all sizes are rounded to at least 512 bytes
static constexpr size_t kSmallSize =
1048576; // largest "small" allocation is 1 MiB
static constexpr size_t kSmallBuffer =
2097152; // "small" allocations are packed in 2 MiB blocks
static constexpr size_t kLargeBuffer =
20971520; // "large" allocations may be packed in 20 MiB blocks
static constexpr size_t kMinLargeAlloc =
10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
static constexpr size_t kRoundLarge =
2097152; // round up large allocations to 2 MiB
static constexpr size_t kMaxSplitableBlockSize =
200 << 20; // To further reduce fragmentation, blocks >= 200MB are not
// allowed to be split
static constexpr size_t kMinAllocationSize = 512;
static constexpr size_t kMaxInternalFragmentation = 8U << 20U; // 8MB
static constexpr size_t kMinExtendSize = 8U << 20U; // 8MB

size_t cachedBytes = 0;
size_t allocatedBytes = 0;
Expand Down Expand Up @@ -79,6 +67,8 @@ class BFCachingAllocatorImpl {
__uint128_t bits = 0;
// Virtual chunks which are the heads of the bins
std::array<int, static_cast<size_t>(kNumBigBins* kNumSubBins)> binHeads_{};
// The extending size next time
size_t currExtendSize_ = kMinExtendSize;

explicit StreamSet(size_t id) : id(id) {}

Expand Down Expand Up @@ -150,11 +140,7 @@ class BFCachingAllocatorImpl {
mutable mutex_t mut_;

static size_t roundBytes(size_t nbytes) {
if (nbytes <= kMinBlockSize) {
return kMinBlockSize;
}
int clz = __builtin_clzll(nbytes - 1);
return (1LU << (sizeof(int64_t) - clz));
return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1;
}

int newChunk(void* ptr, size_t size, size_t stream) {
Expand All @@ -177,7 +163,7 @@ class BFCachingAllocatorImpl {
// Big bin range:
// [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx`
// Split big bin into `kNumSubBins` sub bins
size_t nBlocks = nbytes / kMinBlockSize;
size_t nBlocks = nbytes / kMinAllocationSize;
constexpr int kMaxBinIdx = 63;
int bigBinIdx = kMaxBinIdx - __builtin_clzll(nBlocks);
// If `nbytes` is so large, we just put it into the last
Expand Down Expand Up @@ -253,22 +239,16 @@ class BFCachingAllocatorImpl {
return id;
}

void shrink(StreamSetHandle& set, size_t try_release_size = 0) {
size_t released_size = 0;
void shrink(StreamSetHandle& set) {
for (int binHead : set->binHeads_) {
int k = chunks_[binHead].nextChunkInList;
while (k) {
auto& chunk_k = chunks_[k];
if (chunk_k.isMonoBlock()) {
released_size += chunk_k.size;
releaseOnDevice(chunk_k.ptr, chunk_k.size);
if (chunks_[k].isMonoBlock()) {
releaseOnDevice(chunks_[k].ptr, chunks_[k].size);
removeChunkFromBin(k);
recycleIds_.push(k);
if (try_release_size > 0 && released_size >= try_release_size) {
break;
}
}
k = chunk_k.nextChunkInList;
k = chunks_[k].nextChunkInList;
}
}
}
Expand Down Expand Up @@ -311,39 +291,32 @@ class BFCachingAllocatorImpl {
return id;
}

size_t getAllocateSize(size_t nbytes) {
if (nbytes <= kSmallSize) {
return kSmallBuffer;
}
if (nbytes < kMinLargeAlloc) {
return kLargeBuffer;
}
return round_up_to_alignment(nbytes, kRoundLarge);
}

int extend(size_t nbytes, StreamSetHandle& set) {
size_t allocateSize = getAllocateSize(nbytes);

void* ptr = allocateOnDevice(allocateSize);
if (!ptr) {
shrink(set, allocateSize);
ptr = allocateOnDevice(allocateSize);
}
if (!ptr) {
shrink(set);
ptr = allocateOnDevice(allocateSize);
}
if (!ptr) {
if (allocateSize > nbytes) {
allocateSize = nbytes;
ptr = allocateOnDevice(allocateSize);
emptyCacheWithoutLock();
auto& extSize = set->currExtendSize_;
bool increased = false;
while (extSize < nbytes && extSize < kMaxExtendSize) {
extSize *= 2;
increased = true;
}

size_t currBytes = std::max(nbytes, extSize);
void* ptr = allocateOnDevice(currBytes);
if (ptr) {
if (!increased && extSize < kMaxExtendSize) {
extSize *= 2;
}
} else {
if (currBytes > nbytes) {
currBytes = nbytes;
ptr = allocateOnDevice(currBytes);
}
}
if (!ptr) {
return 0;
}

int id = newChunk(ptr, allocateSize, set->id);
int id = newChunk(ptr, currBytes, set->id);
return id;
}

Expand Down Expand Up @@ -398,7 +371,8 @@ class BFCachingAllocatorImpl {
}

if (id) {
if (chunks_[id].size >= (nbytes << 1)) {
if (chunks_[id].size >= nbytes * 2 ||
chunks_[id].size >= nbytes + kMaxInternalFragmentation) {
id = split(id, nbytes);
}
chunks_[id].allocated = true;
Expand Down Expand Up @@ -532,9 +506,6 @@ class BFCachingAllocator : public CacheAllocator {
: DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes) {}

~Context() {
if (size() <= 0) {
return;
}
auto allocator_ = static_cast<const BFCachingAllocator*>(allocator());
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: add to async_mem_pool:"
<< ptr() << ", " << size() << " nbytes, id:"
Expand All @@ -544,22 +515,18 @@ class BFCachingAllocator : public CacheAllocator {
if (ptr()) {
allocator_->metrics_producer.deallocate(ptr());
std::deque<DIPUEvent> events;
bool record_block = false;
for (auto const& stream : streams()) {
events.emplace_back();
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:"
<< stream.rawstream());
events.back().record(stream);
record_block = true;
}
allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_),
events);
allocator_->set_memory_allocated(allocator_->memory_allocated() -
nbytes_);
if (!record_block) {
allocator_->restore();
}
}
allocator_->restore();
} else {
DIPU_DEBUG_ALLOCATOR(8,
"BFCachingAllocator:~Context: destory tensor "
Expand All @@ -570,12 +537,12 @@ class BFCachingAllocator : public CacheAllocator {

friend class Context;

c10::DataPtr allocate(size_t origin_size) const override {
c10::DataPtr allocate(size_t size) const override {
restore();
if (async_mem_pool()->size() > kMaxAsyncResourcePoolLength) {
try_empty_resource_pool();
}
size_t size = getMemoryAlignmentStrategy()->roundBytes(origin_size);
size = getMemoryAlignmentStrategy()->roundBytes(size);
std::tuple<void*, int, size_t> block = impl->allocateRaw(size);
void* ptr = std::get<0>(block);
if (ptr == nullptr && size > 0) {
Expand All @@ -601,8 +568,8 @@ class BFCachingAllocator : public CacheAllocator {
deleteBFContext, device());
DIPU_DEBUG_ALLOCATOR(
4, "BFCachingAllocator: malloc "
<< nbytes << ",requires " << origin_size
<< " nbytes, ptr:" << ptr << ",device:" << device()
<< nbytes << ",requires " << size << " nbytes, ptr:" << ptr
<< ",device:" << device()
<< ",async_mempool.size:" << async_mem_pool()->size());
c10::reportMemoryUsageToProfiler(
ptr, static_cast<int64_t>(nbytes), memory_allocated(),
Expand Down
Loading

0 comments on commit eb75ef9

Please sign in to comment.