optimize bf allocator #869

Open · wants to merge 4 commits into base: main
Changes from 1 commit
Incrementally optimize the BF allocator. You can use DIPU_DEVICE_MEMCACHING_ALGORITHM=BF2 to use the optimized allocator without affecting the original BF implementation.
zhaoguochun1995 committed Jul 23, 2024
commit eb75ef995661b4349b8f1601208b49d8483bd092
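For reference, the BF2 path is opt-in: as the commit message notes, it is selected through the DIPU_DEVICE_MEMCACHING_ALGORITHM environment variable (for example, DIPU_DEVICE_MEMCACHING_ALGORITHM=BF2 python your_script.py), leaving the original BF allocator untouched. Below is a minimal sketch of that kind of env-driven dispatch; the helper name and the "BF" fallback are assumptions for illustration, not the actual DIPU registration code.

#include <cstdlib>
#include <string>

// Hypothetical sketch only: the real selection logic lives in DIPU's
// allocator registration, which this diff does not show.
std::string selectedCachingAlgorithm() {
  const char* env = std::getenv("DIPU_DEVICE_MEMCACHING_ALGORITHM");
  // Values exercised by test_allocator.py: "TORCH", "BF", "BF2", "BS", "RAW".
  return env != nullptr ? std::string(env) : std::string("BF");  // fallback assumed
}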
1 change: 1 addition & 0 deletions dipu/tests/python/individual_scripts/test_allocator.py
@@ -82,6 +82,7 @@ def test_allocator(
(
{"args": (MAX_ALLOCATE, 1, "TORCH", 0)},
{"args": (MAX_ALLOCATE, 1, "BF", 0)},
{"args": (MAX_ALLOCATE, 1, "BF2", 0)},
{"args": (MAX_ALLOCATE, 1, "BS", 0)},
{"args": (MAX_ALLOCATE, 1, "RAW", 0)},
{"args": (MAX_ALLOCATE, 17919, "BF", 3, False)},
1 change: 1 addition & 0 deletions dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -116,6 +116,7 @@ set(TORCH_DIPU_SOURCE
runtime/core/allocator/DIPURawAllocator.cpp
runtime/core/allocator/DIPUCachingAllocator.cpp
runtime/core/allocator/DIPUBFCachingAllocator.cpp
runtime/core/allocator/DIPUBFCachingAllocator2.cpp
runtime/core/allocator/DIPUBSCachingAllocator.cpp
runtime/core/allocator/DIPUCachingHostAllocator.cpp
runtime/core/allocator/DIPUCachingDeviceAllocator.cpp
dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator2.cpp
@@ -1,21 +1,22 @@
// Copyright (c) 2023, DeepLink.

#include <cstddef>
#include <functional>
#include <memory>
#include <stack>
#include <thread>
#include <utility>
#include <vector>

#include "csrc_dipu/utils/env.hpp"

#include "DIPUCachingAllocator.h"
#include "DIPUSpinMutex.h"

namespace dipu {

inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
return ((nbytes - 1) | (alignment_size - 1)) + 1;
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const size_t kMaxExtendSize = get_env_or_default("DIPU_MAX_EXTEND_SIZE", 1024)
<< 20U;

class BFCachingAllocatorImpl {
public:
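A quick standalone check of the helper and constant added above: round_up_to_alignment assumes a power-of-two alignment_size and rounds nbytes up to the next multiple of it, and kMaxExtendSize defaults to 1024 << 20 bytes (1 GiB), with DIPU_MAX_EXTEND_SIZE read in MiB.

#include <cassert>
#include <cstddef>

// Same bit trick as round_up_to_alignment in the hunk above: for a
// power-of-two alignment, (nbytes - 1) | (alignment - 1) sets all low bits,
// so adding 1 lands on the next multiple of the alignment.
inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
  return ((nbytes - 1) | (alignment_size - 1)) + 1;
}

int main() {
  assert(round_up_to_alignment(1, 512) == 512);
  assert(round_up_to_alignment(512, 512) == 512);     // already aligned
  assert(round_up_to_alignment(513, 512) == 1024);    // next multiple
  assert(round_up_to_alignment(1024, 4096) == 4096);
  return 0;
}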
@@ -30,23 +31,10 @@ class BFCachingAllocatorImpl {
// Number of second level bins (linearly)
static constexpr int kNumSubBins = 4;
static constexpr int kLogNumSubBins = 2;

// Allocation parameters
static constexpr size_t kMinBlockSize =
512; // all sizes are rounded to at least 512 bytes
static constexpr size_t kSmallSize =
1048576; // largest "small" allocation is 1 MiB
static constexpr size_t kSmallBuffer =
2097152; // "small" allocations are packed in 2 MiB blocks
static constexpr size_t kLargeBuffer =
20971520; // "large" allocations may be packed in 20 MiB blocks
static constexpr size_t kMinLargeAlloc =
10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
static constexpr size_t kRoundLarge =
2097152; // round up large allocations to 2 MiB
static constexpr size_t kMaxSplitableBlockSize =
200 << 20; // To further reduce fragmentation, blocks >= 200MB are not
// allowed to be split
static constexpr size_t kMinAllocationSize = 512;
static constexpr size_t kMaxInternalFragmentation = 8U << 20U; // 8MB
static constexpr size_t kMinExtendSize = 8U << 20U; // 8MB

size_t cachedBytes = 0;
size_t allocatedBytes = 0;
@@ -79,6 +67,8 @@ class BFCachingAllocatorImpl {
__uint128_t bits = 0;
// Virtual chunks which are the heads of the bins
std::array<int, static_cast<size_t>(kNumBigBins* kNumSubBins)> binHeads_{};
// The extending size next time
size_t currExtendSize_ = kMinExtendSize;

explicit StreamSet(size_t id) : id(id) {}

@@ -150,11 +140,7 @@ class BFCachingAllocatorImpl {
mutable mutex_t mut_;

static size_t roundBytes(size_t nbytes) {
if (nbytes <= kMinBlockSize) {
return kMinBlockSize;
}
int clz = __builtin_clzll(nbytes - 1);
return (1LU << (sizeof(int64_t) * 8 - clz));
return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1;
}

int newChunk(void* ptr, size_t size, size_t stream) {
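What the new roundBytes buys: the removed code rounded every request up to the next power of two (with a 512-byte floor), so a request just past a power of two nearly doubled its footprint; the new code rounds to the next multiple of kMinAllocationSize (512 bytes), capping rounding waste at 511 bytes. A comparison sketch, with the power-of-two helper re-implementing the removed behaviour for illustration:

#include <cassert>
#include <cstddef>

// Removed behaviour, re-implemented for comparison: round up to the next
// power of two, never below 512 bytes.
static size_t roundPow2(size_t nbytes) {
  size_t r = 512;
  while (r < nbytes) {
    r <<= 1;
  }
  return r;
}

// New behaviour: round up to the next multiple of kMinAllocationSize (512).
static size_t round512(size_t nbytes) { return ((nbytes - 1) | 511U) + 1; }

int main() {
  size_t n = (5U << 20U) + 1;                // 5 MiB + 1 byte
  assert(roundPow2(n) == (8U << 20U));       // old: 8 MiB, ~3 MiB of padding
  assert(round512(n) == (5U << 20U) + 512);  // new: one extra 512-byte unit
  return 0;
}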
@@ -177,7 +163,7 @@ class BFCachingAllocatorImpl {
// Big bin range:
// [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx`
// Split big bin into `kNumSubBins` sub bins
size_t nBlocks = nbytes / kMinBlockSize;
size_t nBlocks = nbytes / kMinAllocationSize;
constexpr int kMaxBinIdx = 63;
int bigBinIdx = kMaxBinIdx - __builtin_clzll(nBlocks);
// If `nbytes` is so large, we just put it into the last
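In other words, the big-bin index is floor(log2(nbytes / kMinAllocationSize)): 63 - __builtin_clzll(n) is the position of the highest set bit of a 64-bit n. A small sketch using the constants above (sizes are assumed to be already rounded to at least 512 bytes, as roundBytes guarantees):

#include <cassert>
#include <cstddef>

// Big-bin lookup as in the hunk above: express the size in 512-byte blocks,
// then take the index of the highest set bit, i.e. floor(log2(nBlocks)).
static int bigBinIndex(size_t nbytes) {
  size_t nBlocks = nbytes / 512;  // kMinAllocationSize blocks; nBlocks >= 1
  return 63 - __builtin_clzll(nBlocks);
}

int main() {
  assert(bigBinIndex(512) == 0);         // 1 block     -> big bin 0
  assert(bigBinIndex(1024) == 1);        // 2 blocks    -> big bin 1
  assert(bigBinIndex(1U << 20U) == 11);  // 2048 blocks -> big bin 11
  return 0;
}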
@@ -253,22 +239,16 @@ class BFCachingAllocatorImpl {
return id;
}

void shrink(StreamSetHandle& set, size_t try_release_size = 0) {
size_t released_size = 0;
void shrink(StreamSetHandle& set) {
for (int binHead : set->binHeads_) {
int k = chunks_[binHead].nextChunkInList;
while (k) {
auto& chunk_k = chunks_[k];
if (chunk_k.isMonoBlock()) {
released_size += chunk_k.size;
releaseOnDevice(chunk_k.ptr, chunk_k.size);
if (chunks_[k].isMonoBlock()) {
releaseOnDevice(chunks_[k].ptr, chunks_[k].size);
removeChunkFromBin(k);
recycleIds_.push(k);
if (try_release_size > 0 && released_size >= try_release_size) {
break;
}
}
k = chunk_k.nextChunkInList;
k = chunks_[k].nextChunkInList;
}
}
}
@@ -311,39 +291,32 @@ class BFCachingAllocatorImpl {
return id;
}

size_t getAllocateSize(size_t nbytes) {
if (nbytes <= kSmallSize) {
return kSmallBuffer;
}
if (nbytes < kMinLargeAlloc) {
return kLargeBuffer;
}
return round_up_to_alignment(nbytes, kRoundLarge);
}

int extend(size_t nbytes, StreamSetHandle& set) {
size_t allocateSize = getAllocateSize(nbytes);

void* ptr = allocateOnDevice(allocateSize);
if (!ptr) {
shrink(set, allocateSize);
ptr = allocateOnDevice(allocateSize);
}
if (!ptr) {
shrink(set);
ptr = allocateOnDevice(allocateSize);
}
if (!ptr) {
if (allocateSize > nbytes) {
allocateSize = nbytes;
ptr = allocateOnDevice(allocateSize);
emptyCacheWithoutLock();
auto& extSize = set->currExtendSize_;
bool increased = false;
while (extSize < nbytes && extSize < kMaxExtendSize) {
extSize *= 2;
increased = true;
}

size_t currBytes = std::max(nbytes, extSize);
void* ptr = allocateOnDevice(currBytes);
if (ptr) {
if (!increased && extSize < kMaxExtendSize) {
extSize *= 2;
}
} else {
if (currBytes > nbytes) {
currBytes = nbytes;
ptr = allocateOnDevice(currBytes);
}
}
if (!ptr) {
return 0;
}

int id = newChunk(ptr, allocateSize, set->id);
int id = newChunk(ptr, currBytes, set->id);
return id;
}

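The rewritten extend() drops the fixed small/large buffer sizes in favour of a per-stream geometric growth policy: currExtendSize_ starts at kMinExtendSize (8 MiB), doubles until it covers the request or hits kMaxExtendSize, and doubles once more after a successful allocation that needed no growth, so a stream's successive extensions reserve 8, 16, 32, ... MiB. A standalone sketch of just the sizing arithmetic, assuming the device allocation succeeds (the fallback to exactly nbytes on failure is omitted):

#include <algorithm>
#include <cstddef>
#include <cstdio>

constexpr size_t kMinExtendSize = 8U << 20U;      // 8 MiB
constexpr size_t kMaxExtendSize = 1024UL << 20U;  // default DIPU_MAX_EXTEND_SIZE, 1 GiB

// Sizing logic of extend() above with allocateOnDevice() stubbed out as
// always succeeding; extSize plays the role of the stream's currExtendSize_.
static size_t nextReservation(size_t nbytes, size_t& extSize) {
  bool increased = false;
  while (extSize < nbytes && extSize < kMaxExtendSize) {
    extSize *= 2;
    increased = true;
  }
  size_t currBytes = std::max(nbytes, extSize);
  if (!increased && extSize < kMaxExtendSize) {
    extSize *= 2;  // grow the next extension
  }
  return currBytes;
}

int main() {
  size_t extSize = kMinExtendSize;
  for (int i = 0; i < 4; ++i) {
    // Four 1 MiB requests reserve 8, 16, 32 and 64 MiB chunks in turn.
    std::printf("%zu MiB\n", nextReservation(1U << 20U, extSize) >> 20U);
  }
  return 0;
}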
@@ -398,7 +371,8 @@ class BFCachingAllocatorImpl {
}

if (id) {
if (chunks_[id].size >= (nbytes << 1)) {
if (chunks_[id].size >= nbytes * 2 ||
chunks_[id].size >= nbytes + kMaxInternalFragmentation) {
id = split(id, nbytes);
}
chunks_[id].allocated = true;
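The second clause of the split test is the new part: previously a chunk was split only when it was at least twice the request, so a near-fit large chunk could carry megabytes of hidden padding. Now any leftover of at least kMaxInternalFragmentation (8 MiB) is split off and returned to the bins. A sketch of the decision:

#include <cassert>
#include <cstddef>

constexpr size_t kMaxInternalFragmentation = 8U << 20U;  // 8 MiB

// Split rule from the hunk above: split when the leftover is large either
// relatively (chunk >= 2x the request) or absolutely (leftover >= 8 MiB).
static bool shouldSplit(size_t chunkSize, size_t nbytes) {
  return chunkSize >= nbytes * 2 ||
         chunkSize >= nbytes + kMaxInternalFragmentation;
}

int main() {
  size_t req = 512UL << 20U;    // 512 MiB request
  size_t chunk = 520UL << 20U;  // best-fit candidate is 520 MiB
  assert(shouldSplit(chunk, req));  // new rule: give 8 MiB back to the bins
  assert(!(chunk >= req * 2));      // the old rule alone would not split
  return 0;
}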
@@ -532,9 +506,6 @@ class BFCachingAllocator : public CacheAllocator {
: DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes) {}

~Context() {
if (size() <= 0) {
return;
}
auto allocator_ = static_cast<const BFCachingAllocator*>(allocator());
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: add to async_mem_pool:"
<< ptr() << ", " << size() << " nbytes, id:"
@@ -544,22 +515,18 @@
if (ptr()) {
allocator_->metrics_producer.deallocate(ptr());
std::deque<DIPUEvent> events;
bool record_block = false;
for (auto const& stream : streams()) {
events.emplace_back();
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:"
<< stream.rawstream());
events.back().record(stream);
record_block = true;
}
allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_),
events);
allocator_->set_memory_allocated(allocator_->memory_allocated() -
nbytes_);
if (!record_block) {
allocator_->restore();
}
}
allocator_->restore();
} else {
DIPU_DEBUG_ALLOCATOR(8,
"BFCachingAllocator:~Context: destory tensor "
@@ -570,12 +537,12 @@

friend class Context;

c10::DataPtr allocate(size_t origin_size) const override {
c10::DataPtr allocate(size_t size) const override {
restore();
if (async_mem_pool()->size() > kMaxAsyncResourcePoolLength) {
try_empty_resource_pool();
}
size_t size = getMemoryAlignmentStrategy()->roundBytes(origin_size);
size = getMemoryAlignmentStrategy()->roundBytes(size);
std::tuple<void*, int, size_t> block = impl->allocateRaw(size);
void* ptr = std::get<0>(block);
if (ptr == nullptr && size > 0) {
@@ -601,8 +568,8 @@
deleteBFContext, device());
DIPU_DEBUG_ALLOCATOR(
4, "BFCachingAllocator: malloc "
<< nbytes << ",requires " << origin_size
<< " nbytes, ptr:" << ptr << ",device:" << device()
<< nbytes << ",requires " << size << " nbytes, ptr:" << ptr
<< ",device:" << device()
<< ",async_mempool.size:" << async_mem_pool()->size());
c10::reportMemoryUsageToProfiler(
ptr, static_cast<int64_t>(nbytes), memory_allocated(),