From 5ae7345f462b9423aa3baccbf89a6fe7c8d2b78c Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 23 Oct 2024 04:07:25 +0000
Subject: [PATCH 1/4] Adding runtime option to enable/disable inlining fastpath allocation with MMTk

---
 base/options.jl               |   1 +
 src/jloptions.c               |  25 ++++++
 src/jloptions.h               |   1 +
 src/llvm-late-gc-lowering.cpp | 153 +++++++++++++++++-----------------
 4 files changed, 104 insertions(+), 76 deletions(-)

diff --git a/base/options.jl b/base/options.jl
index f535c27d99122..4ef5be7c3d1e4 100644
--- a/base/options.jl
+++ b/base/options.jl
@@ -33,6 +33,7 @@ struct JLOptions
     warn_overwrite::Int8
     can_inline::Int8
     polly::Int8
+    mmtk_inline_fastpath::Int8
     trace_compile::Ptr{UInt8}
     trace_dispatch::Ptr{UInt8}
     fast_math::Int8
diff --git a/src/jloptions.c b/src/jloptions.c
index 35f0a76e3f6e7..87324d1acb193 100644
--- a/src/jloptions.c
+++ b/src/jloptions.c
@@ -76,6 +76,11 @@ JL_DLLEXPORT void jl_init_options(void)
                        0, // method overwrite warning
                        1, // can_inline
                        JL_OPTIONS_POLLY_ON, // polly
+#ifdef MMTK_GC
+                       1, // inline fastpath allocation for mmtk
+#else
+                       0,
+#endif
                        NULL, // trace_compile
                        NULL, // trace_dispatch
                        JL_OPTIONS_FAST_MATH_DEFAULT,
@@ -207,6 +212,10 @@ static const char opts[] =
     " --polly={yes*|no}               Enable or disable the polyhedral optimizer Polly\n"
     "                                 (overrides @polly declaration)\n"
 #endif
+#ifdef MMTK_GC
+    " --inline-fastpath={yes*|no}     Enable or disable inlining allocation fastpath for MMTk\n"
+    "                                 during code generation.\n"
+#endif
 
     // instrumentation options
     " --code-coverage[={none*|user|all}]  Count executions of source lines (omitting setting is\n"
@@ -293,6 +302,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
            opt_warn_scope,
            opt_inline,
            opt_polly,
+           opt_mmtk_inline_fastpath,
            opt_trace_compile,
            opt_trace_compile_timing,
            opt_trace_dispatch,
@@ -372,6 +382,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
         { "warn-scope", required_argument, 0, opt_warn_scope },
         { "inline", required_argument, 0, opt_inline },
         { "polly", required_argument, 0, opt_polly },
+        { "inline-fastpath", required_argument, 0, opt_mmtk_inline_fastpath },
         { "trace-compile", required_argument, 0, opt_trace_compile },
         { "trace-compile-timing", no_argument, 0, opt_trace_compile_timing },
         { "trace-dispatch", required_argument, 0, opt_trace_dispatch },
@@ -823,6 +834,20 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
                 jl_errorf("julia: invalid argument to --polly (%s)", optarg);
             }
             break;
+        case opt_mmtk_inline_fastpath:
+            if (!strcmp(optarg,"yes"))
+#ifdef MMTK_GC
+                jl_options.mmtk_inline_fastpath = 1;
+#else
+                // always set to 0 if not using MMTk
+                jl_options.mmtk_inline_fastpath = 0;
+#endif
+            else if (!strcmp(optarg,"no"))
+                jl_options.mmtk_inline_fastpath = 0;
+            else {
+                jl_errorf("julia: invalid argument to --inline-fastpath (%s)", optarg);
+            }
+            break;
         case opt_trace_compile:
             jl_options.trace_compile = strdup(optarg);
             if (!jl_options.trace_compile)
diff --git a/src/jloptions.h b/src/jloptions.h
index e58797caace3c..f49f8e3f60ea8 100644
--- a/src/jloptions.h
+++ b/src/jloptions.h
@@ -37,6 +37,7 @@ typedef struct {
     int8_t warn_overwrite;
     int8_t can_inline;
     int8_t polly;
+    int8_t mmtk_inline_fastpath;
     const char *trace_compile;
     const char *trace_dispatch;
     int8_t fast_math;
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 3201ae64cf984..c3ea82b4d6b41 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2505,9 +2505,19 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
-#ifdef MMTK_GC
 Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
 {
+#ifndef MMTK_GC
+    // For Julia's stock GC, this option should always be 0
+    assert(jl_options.mmtk_inline_fastpath == 0);
+#endif
+
+    // Setting --inline-fastpath=false with MMTk will increase allocation
+    // overhead a lot, and should only be used for debugging.
+    if (jl_options.mmtk_inline_fastpath == 0) {
+        return target;
+    }
+
     assert(target->arg_size() == 3);
 
     IRBuilder<> builder(target);
@@ -2525,78 +2535,72 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
             auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
             auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
-            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
-            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
-            const bool INLINE_FASTPATH_ALLOCATION = true;
-
-            if (INLINE_FASTPATH_ALLOCATION) {
-                // Assuming we use the first immix allocator.
-                // FIXME: We should get the allocator index and type from MMTk.
-                auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
-
-                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
-                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
-
-                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
-                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
-                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
-
-                // offset = 8
-                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
-                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
-                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
-                // alignment 16 (15 = 16 - 1)
-                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
-                auto result = builder.CreateNSWAdd(cursor, delta, "result");
-
-                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
-
-                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
-                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
-                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
-
-                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
-
-                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
-                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
-
-                auto next_instr = target->getNextNode();
-                SmallVector<uint32_t, 2> Weights{1, 9};
-
-                MDBuilder MDB(F.getContext());
-                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
-
-                builder.SetInsertPoint(next_instr);
-                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
-
-                // slowpath
-                builder.SetInsertPoint(slowpath);
-                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
-                auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
-                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
-                builder.CreateBr(next_instr->getParent());
-
-                // fastpath
-                builder.SetInsertPoint(fastpath);
-                builder.CreateStore(new_cursor, cursor_ptr);
-
-                // ptls->gc_tls.gc_num.allocd += osize;
-                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
-                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
-                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
-                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
-                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
-                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
-
-                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
-                auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
-                builder.CreateBr(next_instr->getParent());
-
-                phiNode->addIncoming(new_call, slowpath);
-                phiNode->addIncoming(v_as_ptr, fastpath);
-                phiNode->takeName(target);
-                return phiNode;
-            }
+            // Assuming we use the first immix allocator.
+            // FIXME: We should get the allocator index and type from MMTk.
+            auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+            auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
+            auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
+
+            auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+            auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
+            auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
+
+            // offset = 8
+            auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+            auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+            auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+            // alignment 16 (15 = 16 - 1)
+            auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+            auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+            auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+            auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+            auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
+            auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
+
+            auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+            auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+            auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
+
+            auto next_instr = target->getNextNode();
+            SmallVector<uint32_t, 2> Weights{1, 9};
+
+            MDBuilder MDB(F.getContext());
+            SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
+
+            builder.SetInsertPoint(next_instr);
+            auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
+
+            // slowpath
+            builder.SetInsertPoint(slowpath);
+            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+            auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
+            new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+            builder.CreateBr(next_instr->getParent());
+
+            // fastpath
+            builder.SetInsertPoint(fastpath);
+            builder.CreateStore(new_cursor, cursor_ptr);
+
+            // ptls->gc_tls.gc_num.allocd += osize;
+            auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
+            auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+            auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+            auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+            auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+            builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+            auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+            auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
+            builder.CreateBr(next_instr->getParent());
+
+            phiNode->addIncoming(new_call, slowpath);
+            phiNode->addIncoming(v_as_ptr, fastpath);
+            phiNode->takeName(target);
+            return phiNode;
         }
     }
     return target;
 }
@@ -2616,7 +2620,6 @@ static void replaceInstruction(
         ++it;
     }
 }
-#endif
 
 bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     initAll(*F.getParent());
@@ -2636,7 +2639,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
     CleanupIR(F, &S, CFGModified);
 
-#ifdef MMTK_GC
     // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
     for (BasicBlock &BB : F) {
         for (auto it = BB.begin(); it != BB.end();) {
@@ -2658,7 +2660,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
             ++it;
         }
     }
-#endif
 
     return true;
 }

From 0f32465912fa8d729ec192302ae0e5dfda5b3af4 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 23 Oct 2024 05:04:35 +0000
Subject: [PATCH 2/4] Iterating over all uses of GCAllocBytes instead of all instructions

---
 src/llvm-late-gc-lowering.cpp | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index c3ea82b4d6b41..e71e39141e145 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2640,24 +2640,18 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     CleanupIR(F, &S, CFGModified);
 
     // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
-    for (BasicBlock &BB : F) {
-        for (auto it = BB.begin(); it != BB.end();) {
-            auto *CI = dyn_cast<CallInst>(&*it);
-            if (!CI) {
-                ++it;
-                continue;
-            }
-
-            Value *callee = CI->getCalledOperand();
-            assert(callee);
-
-            auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
-            if (GCAllocBytes == callee) {
-                *CFGModified = true;
-                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
-                continue;
+    auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
+
+    if (GCAllocBytes) {
+        for (User *U : GCAllocBytes->users()) {
+            if (auto *CI = dyn_cast<CallInst>(U)) {
+                auto new_CI = lowerGCAllocBytesLate(CI, F);
+                if (new_CI != CI) {
+                    *CFGModified = true;
+                    CI->replaceAllUsesWith(new_CI);
+                    CI->eraseFromParent();
+                }
             }
-            ++it;
         }
     }
 

From 783e8897b772b5542f0ae064e4a6f429e2c85715 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 23 Oct 2024 05:13:09 +0000
Subject: [PATCH 3/4] Adding warning when trying to set --inline-fastpath without MMTk

---
 src/jloptions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/jloptions.c b/src/jloptions.c
index 87324d1acb193..d5adcfb21e6e2 100644
--- a/src/jloptions.c
+++ b/src/jloptions.c
@@ -841,6 +841,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
 #else
                 // always set to 0 if not using MMTk
                 jl_options.mmtk_inline_fastpath = 0;
+                jl_printf(JL_STDERR, "WARNING: Attempting to set --inline-fastpath without using MMTk");
 #endif
             else if (!strcmp(optarg,"no"))
                 jl_options.mmtk_inline_fastpath = 0;

From c70fcf24de0113841c558e438c56b58b705ab8ad Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Thu, 24 Oct 2024 05:51:46 +0000
Subject: [PATCH 4/4] Revert "Iterating over all uses of GCAllocBytes instead of all instructions"

This reverts commit 0f32465912fa8d729ec192302ae0e5dfda5b3af4.
---
 src/llvm-late-gc-lowering.cpp | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index e71e39141e145..c3ea82b4d6b41 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2640,18 +2640,24 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     CleanupIR(F, &S, CFGModified);
 
     // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
-    auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
-
-    if (GCAllocBytes) {
-        for (User *U : GCAllocBytes->users()) {
-            if (auto *CI = dyn_cast<CallInst>(U)) {
-                auto new_CI = lowerGCAllocBytesLate(CI, F);
-                if (new_CI != CI) {
-                    *CFGModified = true;
-                    CI->replaceAllUsesWith(new_CI);
-                    CI->eraseFromParent();
-                }
+    for (BasicBlock &BB : F) {
+        for (auto it = BB.begin(); it != BB.end();) {
+            auto *CI = dyn_cast<CallInst>(&*it);
+            if (!CI) {
+                ++it;
+                continue;
+            }
+
+            Value *callee = CI->getCalledOperand();
+            assert(callee);
+
+            auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
+            if (GCAllocBytes == callee) {
+                *CFGModified = true;
+                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
+                continue;
             }
+            ++it;
         }
     }
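
For reference, a minimal sketch of how the new flag can be exercised once this series is applied, assuming a Julia build configured with MMTK_GC (the build configuration itself is not part of these patches). Base.JLOptions() mirrors the jl_options_t struct extended above, so the parsed value is visible from Julia:

    # Disable the inlined MMTk allocation fastpath (debugging only; allocation gets noticeably slower).
    $ julia --inline-fastpath=no -e 'println(Base.JLOptions().mmtk_inline_fastpath)'
    0

    # Default on an MMTk build: the fastpath is inlined during code generation.
    $ julia -e 'println(Base.JLOptions().mmtk_inline_fastpath)'
    1

On a stock-GC build the patches keep mmtk_inline_fastpath at 0, and after patch 3 a request for --inline-fastpath=yes is intended to print a warning instead of enabling it.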