mmtk · udesou · Oct 24, 2024 · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/base/options.jl b/base/options.jl
@@ -33,6 +33,7 @@ struct JLOptions
     warn_overwrite::Int8
     can_inline::Int8
     polly::Int8
+    mmtk_inline_fastpath::Int8
     trace_compile::Ptr{UInt8}
     trace_dispatch::Ptr{UInt8}
     fast_math::Int8

diff --git a/src/jloptions.c b/src/jloptions.c
@@ -76,6 +76,11 @@ JL_DLLEXPORT void jl_init_options(void)
                         0,    // method overwrite warning
                         1,    // can_inline
                         JL_OPTIONS_POLLY_ON, // polly
+#ifdef MMTK_GC
+                        1,    // inline fastpath allocation for mmtk
+#else
+                        0,    // mmtk_inline_fastpath: always off without MMTk
+#endif
                         NULL, // trace_compile
                         NULL, // trace_dispatch
                         JL_OPTIONS_FAST_MATH_DEFAULT,
@@ -207,6 +212,10 @@ static const char opts[]  =
     " --polly={yes*|no}                             Enable or disable the polyhedral optimizer Polly\n"
     "                                               (overrides @polly declaration)\n"
 #endif
+#ifdef MMTK_GC
+    " --inline-fastpath={yes*|no}                   Enable or disable inlining allocation fastpath for MMTk\n"
+    "                                               during code generation.\n"
+#endif
 
     // instrumentation options
     " --code-coverage[={none*|user|all}]            Count executions of source lines (omitting setting is\n"
@@ -293,6 +302,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
            opt_warn_scope,
            opt_inline,
            opt_polly,
+           opt_mmtk_inline_fastpath,
            opt_trace_compile,
            opt_trace_compile_timing,
            opt_trace_dispatch,
@@ -372,6 +382,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
         { "warn-scope",      required_argument, 0, opt_warn_scope },
         { "inline",          required_argument, 0, opt_inline },
         { "polly",           required_argument, 0, opt_polly },
+        { "inline-fastpath", required_argument, 0, opt_mmtk_inline_fastpath },
         { "trace-compile",   required_argument, 0, opt_trace_compile },
         { "trace-compile-timing",  no_argument, 0, opt_trace_compile_timing },
         { "trace-dispatch",  required_argument, 0, opt_trace_dispatch },
@@ -823,6 +834,21 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
                 jl_errorf("julia: invalid argument to --polly (%s)", optarg);
             }
             break;
+        case opt_mmtk_inline_fastpath: // --inline-fastpath={yes|no}
+            if (!strcmp(optarg,"yes")) { // braces required: the #else arm holds two statements
+#ifdef MMTK_GC
+                jl_options.mmtk_inline_fastpath = 1;
+#else
+                // without MMTk the option stays 0; warn instead of failing
+                jl_options.mmtk_inline_fastpath = 0;
+                jl_printf(JL_STDERR, "WARNING: Attempting to set --inline-fastpath without using MMTk\n");
+#endif
+            }
+            else if (!strcmp(optarg,"no"))
+                jl_options.mmtk_inline_fastpath = 0;
+            else
+                jl_errorf("julia: invalid argument to --inline-fastpath (%s)", optarg);
+            break;
         case opt_trace_compile:
             jl_options.trace_compile = strdup(optarg);
             if (!jl_options.trace_compile)

diff --git a/src/jloptions.h b/src/jloptions.h
@@ -37,6 +37,7 @@ typedef struct {
     int8_t warn_overwrite;
     int8_t can_inline;
     int8_t polly;
+    int8_t mmtk_inline_fastpath;
     const char *trace_compile;
     const char *trace_dispatch;
     int8_t fast_math;

diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
@@ -2505,9 +2505,19 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
-#ifdef MMTK_GC
 Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
 {
+#ifndef MMTK_GC
+    // For Julia's stock GC, this option should always be 0
+    assert(jl_options.mmtk_inline_fastpath == 0);
+#endif
+
+    // Setting --inline-fastpath=no with MMTk will increase allocation
+    // overhead a lot, and should only be used for debugging.
+    if (jl_options.mmtk_inline_fastpath == 0) {
+        return target;
+    }
+
     assert(target->arg_size() == 3);
 
     IRBuilder<> builder(target);
@@ -2525,78 +2535,72 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
             auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
             auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
 
-            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
-            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
-            const bool INLINE_FASTPATH_ALLOCATION = true;
-
-            if (INLINE_FASTPATH_ALLOCATION) {
-                // Assuming we use the first immix allocator.
-                // FIXME: We should get the allocator index and type from MMTk.
-                auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
-
-                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
-                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()),  allocator_offset + offsetof(ImmixAllocator, limit));
-
-                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
-                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
-                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
-
-                // offset = 8
-                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
-                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
-                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
-                // alignment 16 (15 = 16 - 1)
-                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
-                auto result = builder.CreateNSWAdd(cursor, delta, "result");
-
-                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
-
-                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
-                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
-                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
-
-                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
-
-                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
-                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
-
-                auto next_instr = target->getNextNode();
-                SmallVector<uint32_t, 2> Weights{1, 9};
-
-                MDBuilder MDB(F.getContext());
-                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
-
-                builder.SetInsertPoint(next_instr);
-                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
-
-                // slowpath
-                builder.SetInsertPoint(slowpath);
-                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
-                auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
-                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
-                builder.CreateBr(next_instr->getParent());
-
-                // fastpath
-                builder.SetInsertPoint(fastpath);
-                builder.CreateStore(new_cursor, cursor_ptr);
-
-                // ptls->gc_tls.gc_num.allocd += osize;
-                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
-                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
-                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
-                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
-                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
-                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
-
-                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
-                auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
-                builder.CreateBr(next_instr->getParent());
-
-                phiNode->addIncoming(new_call, slowpath);
-                phiNode->addIncoming(v_as_ptr, fastpath);
-                phiNode->takeName(target);
-                return phiNode;
-            }
+            // Assuming we use the first immix allocator.
+            // FIXME: We should get the allocator index and type from MMTk.
+            auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+            auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
+            auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()),  allocator_offset + offsetof(ImmixAllocator, limit));
+
+            auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+            auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
+            auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
+
+            // offset = 8
+            auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+            auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+            auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+            // alignment 16 (15 = 16 - 1)
+            auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+            auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+            auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+            auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+            auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
+            auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
+
+            auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+            auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+            auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
+
+            auto next_instr = target->getNextNode();
+            SmallVector<uint32_t, 2> Weights{1, 9};
+
+            MDBuilder MDB(F.getContext());
+            SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
+
+            builder.SetInsertPoint(next_instr);
+            auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
+
+            // slowpath
+            builder.SetInsertPoint(slowpath);
+            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+            auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
+            new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+            builder.CreateBr(next_instr->getParent());
+
+            // fastpath
+            builder.SetInsertPoint(fastpath);
+            builder.CreateStore(new_cursor, cursor_ptr);
+
+            // ptls->gc_tls.gc_num.allocd += osize;
+            auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
+            auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+            auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+            auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+            auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+            builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+            auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+            auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
+            builder.CreateBr(next_instr->getParent());
+
+            phiNode->addIncoming(new_call, slowpath);
+            phiNode->addIncoming(v_as_ptr, fastpath);
+            phiNode->takeName(target);
+            return phiNode;
         }
     }
     return target;
@@ -2616,7 +2620,6 @@ static void replaceInstruction(
         ++it;
     }
 }
-#endif
 
 bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     initAll(*F.getParent());
@@ -2636,7 +2639,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
     CleanupIR(F, &S, CFGModified);
 
-#ifdef MMTK_GC
     // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
     for (BasicBlock &BB : F) {
         for (auto it = BB.begin(); it != BB.end();) {
@@ -2658,7 +2660,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
             ++it;
         }
     }
-#endif
 
     return true;
 }