Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding runtime option to inline allocation fastpath #68

Merged
merged 4 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions base/options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct JLOptions
warn_overwrite::Int8
can_inline::Int8
polly::Int8
mmtk_inline_fastpath::Int8
trace_compile::Ptr{UInt8}
trace_dispatch::Ptr{UInt8}
fast_math::Int8
Expand Down
26 changes: 26 additions & 0 deletions src/jloptions.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ JL_DLLEXPORT void jl_init_options(void)
0, // method overwrite warning
1, // can_inline
JL_OPTIONS_POLLY_ON, // polly
#ifdef MMTK_GC
1, // inline fastpath allocation for mmtk
#else
0,
#endif
NULL, // trace_compile
NULL, // trace_dispatch
JL_OPTIONS_FAST_MATH_DEFAULT,
Expand Down Expand Up @@ -207,6 +212,10 @@ static const char opts[] =
" --polly={yes*|no} Enable or disable the polyhedral optimizer Polly\n"
" (overrides @polly declaration)\n"
#endif
#ifdef MMTK_GC
" --inline-fastpath={yes*|no} Enable or disable inlining allocation fastpath for MMTk\n"
" during code generation.\n"
#endif

// instrumentation options
" --code-coverage[={none*|user|all}] Count executions of source lines (omitting setting is\n"
Expand Down Expand Up @@ -293,6 +302,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
opt_warn_scope,
opt_inline,
opt_polly,
opt_mmtk_inline_fastpath,
opt_trace_compile,
opt_trace_compile_timing,
opt_trace_dispatch,
Expand Down Expand Up @@ -372,6 +382,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
{ "warn-scope", required_argument, 0, opt_warn_scope },
{ "inline", required_argument, 0, opt_inline },
{ "polly", required_argument, 0, opt_polly },
{ "inline-fastpath", required_argument, 0, opt_mmtk_inline_fastpath },
{ "trace-compile", required_argument, 0, opt_trace_compile },
{ "trace-compile-timing", no_argument, 0, opt_trace_compile_timing },
{ "trace-dispatch", required_argument, 0, opt_trace_dispatch },
Expand Down Expand Up @@ -823,6 +834,21 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
jl_errorf("julia: invalid argument to --polly (%s)", optarg);
}
break;
case opt_mmtk_inline_fastpath:
if (!strcmp(optarg,"yes"))
#ifdef MMTK_GC
jl_options.mmtk_inline_fastpath = 1;
#else
// always set to 0 if not using MMTk
jl_options.mmtk_inline_fastpath = 0;
jl_printf(JL_STDERR, "WARNING: Attempting to set --inline-fastpath without using MMTk");
#endif
else if (!strcmp(optarg,"no"))
jl_options.mmtk_inline_fastpath = 0;
else {
jl_errorf("julia: invalid argument to --inline-fastpath (%s)", optarg);
}
break;
case opt_trace_compile:
jl_options.trace_compile = strdup(optarg);
if (!jl_options.trace_compile)
Expand Down
1 change: 1 addition & 0 deletions src/jloptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ typedef struct {
int8_t warn_overwrite;
int8_t can_inline;
int8_t polly;
int8_t mmtk_inline_fastpath;
const char *trace_compile;
const char *trace_dispatch;
int8_t fast_math;
Expand Down
153 changes: 77 additions & 76 deletions src/llvm-late-gc-lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2505,9 +2505,19 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
}
}

#ifdef MMTK_GC
Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
#ifndef MMTK_GC
// For Julia's stock GC, this option should always be 0
assert(jl_options.mmtk_inline_fastpath == 0);
#endif

// Setting --inline-fastpath=false with MMTk will increase allocation
// overhead a lot, and should only be used for debugging.
if (jl_options.mmtk_inline_fastpath == 0) {
return target;
}

assert(target->arg_size() == 3);

IRBuilder<> builder(target);
Expand All @@ -2525,78 +2535,72 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);

// Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
// Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
const bool INLINE_FASTPATH_ALLOCATION = true;

if (INLINE_FASTPATH_ALLOCATION) {
// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
}
// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
}
}
return target;
Expand All @@ -2616,7 +2620,6 @@ static void replaceInstruction(
++it;
}
}
#endif

bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
initAll(*F.getParent());
Expand All @@ -2636,7 +2639,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
CleanupIR(F, &S, CFGModified);

#ifdef MMTK_GC
// We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
for (BasicBlock &BB : F) {
for (auto it = BB.begin(); it != BB.end();) {
Expand All @@ -2658,7 +2660,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
++it;
}
}
#endif

return true;
}
Expand Down