From b7bd8bd5b82e022448d1cc235157d2f660dff7c7 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 18 Jan 2023 04:45:09 +0000 Subject: [PATCH 1/2] Removing julia's bigvalue_t and moving the decision to allocate large objects to MMTk --- src/array.c | 17 ++-- src/julia_internal.h | 17 ++-- src/llvm-final-gc-lowering.cpp | 141 +++++++++++++++++---------------- src/llvm-pass-helpers.cpp | 2 +- 4 files changed, 87 insertions(+), 90 deletions(-) diff --git a/src/array.c b/src/array.c index 7a84af56d5922..4c164b67c0e7b 100644 --- a/src/array.c +++ b/src/array.c @@ -503,29 +503,28 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) jl_value_t *s; jl_ptls_t ptls = ct->ptls; const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { #ifndef MMTKHEAP + if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass_align8(allocsz); jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; int osize = jl_gc_sizeclasses[pool_id]; // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) s = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); -#else - int pool_id = jl_gc_szclass_align8(allocsz); - int osize = jl_gc_sizeclasses[pool_id]; - s = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, jl_string_type); -#endif } else { if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); -#ifndef MMTKHEAP + s = jl_gc_big_alloc_noinline(ptls, allocsz); + } #else - s = jl_mmtk_gc_alloc_big(ptls, allocsz); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + + s = jl_mmtk_gc_alloc_default(ptls, allocsz, jl_string_type); #endif - } + jl_set_typeof(s, jl_string_type); maybe_record_alloc_to_profile(s, len, jl_string_type); *(size_t*)s = len; diff --git a/src/julia_internal.h b/src/julia_internal.h index c29cb335b4def..ed77fa58c82f9 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -239,7 +239,7 @@ jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); #ifdef MMTKHEAP -JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, size_t size, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); #endif JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize); @@ -364,29 +364,26 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) { jl_value_t *v; const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { #ifndef MMTKHEAP + if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass(allocsz); jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; int osize = jl_gc_sizeclasses[pool_id]; // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); -#else - int pool_id = jl_gc_szclass(allocsz); - int osize = jl_gc_sizeclasses[pool_id]; - v = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, ty); -#endif } else { if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); -#ifndef MMTKHEAP v = jl_gc_big_alloc_noinline(ptls, allocsz); + } #else - v = jl_mmtk_gc_alloc_big(ptls, allocsz); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + + v = jl_mmtk_gc_alloc_default(ptls, allocsz, ty); #endif - } jl_set_typeof(v, ty); maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); return v; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index d19505cbe16e5..a59387155168e 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -171,93 +171,94 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { assert(target->arg_size() == 2); auto sz = (size_t)cast(target->getArgOperand(1))->getZExtValue(); - // This is strongly architecture and OS dependent - int osize; - int offset = jl_gc_classify_pools(sz, &osize); + IRBuilder<> builder(target); builder.SetCurrentDebugLocation(target->getDebugLoc()); auto ptls = target->getArgOperand(0); +#ifndef MMTKHEAP CallInst *newI; + // This is strongly architecture and OS dependent + int osize; + int offset = jl_gc_classify_pools(sz, &osize); if (offset < 0) { newI = builder.CreateCall( bigAllocFunc, { ptls, ConstantInt::get(T_size, sz + sizeof(void*)) }); } else { -#ifndef MMTKHEAP auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize }); -#else - auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); - auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor)); - auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, limit)); - - auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); - auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); - auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); - - - auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); - auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); - auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); - auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); - auto result = builder.CreateNSWAdd(cursor, delta, "result"); - - auto new_cursor = builder.CreateNSWAdd(result, pool_osize); - - auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); - auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); - auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); - - auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); - - auto current_block = target->getParent(); - builder.SetInsertPoint(target->getNextNode()); - auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow"); - auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont"); - - auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); - auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont); - - auto next_br = current_block->getTerminator(); - next_br->eraseFromParent(); - builder.SetInsertPoint(current_block); - builder.CreateCondBr(gt_limit, slowpath, fastpath); - - // slowpath - builder.SetInsertPoint(slowpath); - auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - auto new_call = builder.CreateCall(poolAllocFunc, { pool_offs, pool_osize }); - new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); - builder.CreateBr(top_cont); - - // // fastpath - builder.SetInsertPoint(fastpath); - builder.CreateStore(new_cursor, cursor_ptr); - - // ptls->gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num)); - auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); - auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); - auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); - auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); - builder.CreateStore(pool_allocd_total, pool_alloc_tls); - - auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); - auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); - builder.CreateBr(top_cont); - - phiNode->addIncoming(new_call, slowpath); - phiNode->addIncoming(v_as_ptr, fastpath); - phiNode->takeName(target); - - return phiNode; -#endif } newI->setAttributes(newI->getCalledFunction()->getAttributes()); newI->takeName(target); return newI; +#else + auto pool_osize = ConstantInt::get(T_size, sz + sizeof(void*)); + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, limit)); + + auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); + auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); + + + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); + auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto current_block = target->getParent(); + builder.SetInsertPoint(target->getNextNode()); + auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow"); + auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont"); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont); + + auto next_br = current_block->getTerminator(); + next_br->eraseFromParent(); + builder.SetInsertPoint(current_block); + builder.CreateCondBr(gt_limit, slowpath, fastpath); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(poolAllocFunc, { pool_offs, pool_osize }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(top_cont); + + // // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_num.allocd += osize; + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num)); + auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); + + auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); + auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); + builder.CreateBr(top_cont); + + phiNode->addIncoming(new_call, slowpath); + phiNode->addIncoming(v_as_ptr, fastpath); + phiNode->takeName(target); + + return phiNode; +#endif } bool FinalLowerGC::doInitialization(Module &M) { diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 7cf42abe4b2f1..3eac6963a1338 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -247,7 +247,7 @@ namespace jl_well_known { #ifndef MMTKHEAP { Type::getInt8PtrTy(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()) }, #else - { Type::getInt32Ty(context.getLLVMContext()), Type::getInt64Ty(context.getLLVMContext()) }, + { Type::getInt32Ty(context.getLLVMContext()), context.T_size }, #endif false), Function::ExternalLinkage, From 7152637c15c5fac7adab51269aaa5fabe2ff8273 Mon Sep 17 00:00:00 2001 From: Luis Eduardo de Souza Amorim Date: Tue, 24 Jan 2023 06:20:52 +0000 Subject: [PATCH 2/2] Setting up allocation threshold as immix's MAX_IMMIX_OBJECT_SIZE --- src/array.c | 13 ++- src/julia_internal.h | 7 +- src/llvm-final-gc-lowering.cpp | 140 ++++++++++++++++++--------------- 3 files changed, 86 insertions(+), 74 deletions(-) diff --git a/src/array.c b/src/array.c index 4c164b67c0e7b..24ebbc10c485f 100644 --- a/src/array.c +++ b/src/array.c @@ -471,13 +471,6 @@ JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (len + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { jl_value_t *o = jl_array_data_owner(a); if (jl_is_string(o)) { -#ifdef MMTKHEAP - // since we need the size of the string to be accurate according to its allocation size, we simply allocate a new string here - // instead of changing its size to len as in `*(size_t*)o = len` - o = jl_gc_realloc_string(o, len); - jl_value_t** owner_addr = (a + jl_array_data_owner_offset(jl_array_ndims(a))); - owner_addr = o; -#endif a->flags.isshared = 1; *(size_t*)o = len; a->nrows = 0; @@ -522,7 +515,11 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); - s = jl_mmtk_gc_alloc_default(ptls, allocsz, jl_string_type); + if (allocsz < MAX_STANDARD_OBJECT_SIZE) { + s = jl_mmtk_gc_alloc_default(ptls, allocsz, jl_string_type); + } else { + s = jl_mmtk_gc_alloc_big(ptls, allocsz); + } #endif jl_set_typeof(s, jl_string_type); diff --git a/src/julia_internal.h b/src/julia_internal.h index ed77fa58c82f9..984acd6dcf888 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -239,6 +239,7 @@ jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); #ifdef MMTKHEAP +extern size_t MAX_STANDARD_OBJECT_SIZE; JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, size_t size, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); #endif @@ -382,7 +383,11 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); - v = jl_mmtk_gc_alloc_default(ptls, allocsz, ty); + if (allocsz < (MAX_STANDARD_OBJECT_SIZE - 8)) { // buffer may take 8 bytes extra + v = jl_mmtk_gc_alloc_default(ptls, allocsz, ty); + } else { + v = jl_mmtk_gc_alloc_big(ptls, allocsz); + } #endif jl_set_typeof(v, ty); maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index a59387155168e..2e0a64737ea05 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -175,8 +175,8 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) IRBuilder<> builder(target); builder.SetCurrentDebugLocation(target->getDebugLoc()); auto ptls = target->getArgOperand(0); -#ifndef MMTKHEAP CallInst *newI; +#ifndef MMTKHEAP // This is strongly architecture and OS dependent int osize; int offset = jl_gc_classify_pools(sz, &osize); @@ -194,70 +194,80 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) newI->takeName(target); return newI; #else - auto pool_osize = ConstantInt::get(T_size, sz + sizeof(void*)); - auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor)); - auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, limit)); - - auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); - auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); - auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); - - - auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); - auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); - auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); - auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); - auto result = builder.CreateNSWAdd(cursor, delta, "result"); - - auto new_cursor = builder.CreateNSWAdd(result, pool_osize); - - auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); - auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); - auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); - - auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); - - auto current_block = target->getParent(); - builder.SetInsertPoint(target->getNextNode()); - auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow"); - auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont"); - - auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); - auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont); - - auto next_br = current_block->getTerminator(); - next_br->eraseFromParent(); - builder.SetInsertPoint(current_block); - builder.CreateCondBr(gt_limit, slowpath, fastpath); - - // slowpath - builder.SetInsertPoint(slowpath); - auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - auto new_call = builder.CreateCall(poolAllocFunc, { pool_offs, pool_osize }); - new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); - builder.CreateBr(top_cont); - - // // fastpath - builder.SetInsertPoint(fastpath); - builder.CreateStore(new_cursor, cursor_ptr); - - // ptls->gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num)); - auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); - auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); - auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); - auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); - builder.CreateStore(pool_allocd_total, pool_alloc_tls); - - auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); - auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); - builder.CreateBr(top_cont); - - phiNode->addIncoming(new_call, slowpath); - phiNode->addIncoming(v_as_ptr, fastpath); - phiNode->takeName(target); - - return phiNode; + if (sz + sizeof(void*) >= MAX_STANDARD_OBJECT_SIZE) { + newI = builder.CreateCall( + bigAllocFunc, + { ptls, ConstantInt::get(T_size, sz + sizeof(void*)) }); + + newI->setAttributes(newI->getCalledFunction()->getAttributes()); + newI->takeName(target); + return newI; + } else { + auto pool_osize = ConstantInt::get(T_size, sz + sizeof(void*)); + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, limit)); + + auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); + auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); + + + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); + auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto current_block = target->getParent(); + builder.SetInsertPoint(target->getNextNode()); + auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow"); + auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont"); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont); + + auto next_br = current_block->getTerminator(); + next_br->eraseFromParent(); + builder.SetInsertPoint(current_block); + builder.CreateCondBr(gt_limit, slowpath, fastpath); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(poolAllocFunc, { pool_offs, pool_osize }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(top_cont); + + // // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_num.allocd += osize; + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num)); + auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); + + auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); + auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); + builder.CreateBr(top_cont); + + phiNode->addIncoming(new_call, slowpath); + phiNode->addIncoming(v_as_ptr, fastpath); + phiNode->takeName(target); + + return phiNode; + } #endif }