diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index 5b933c9572763..7ad5ffc2498d7 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -946,8 +946,6 @@ inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
     return v;
 }
 
-
-
 // allocation wrappers that track allocation and let collection run
 JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 38b947d9c421b..62b74b2cdb238 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2607,22 +2607,23 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
 
 #ifdef MMTK_GC
     // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
-    for (BasicBlock &BB : F) {
-        for (auto it = BB.begin(); it != BB.end();) {
-            auto *CI = dyn_cast<CallInst>(&*it);
-            if (!CI) {
-                ++it;
-                continue;
-            }
+    auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
 
-            Value *callee = CI->getCalledOperand();
-            assert(callee);
-
-            auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
-            if (GCAllocBytes == callee) {
+    if (GCAllocBytes) {
+        for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
                 *CFGModified = true;
-                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
-                continue;
+
+                // NOTE(review): this use-list is module-wide, not limited to F -- confirm CI->getFunction() == &F.
+                assert(CI->getCalledOperand() == GCAllocBytes);
+
+                auto newI = lowerGCAllocBytesLate(CI, F);
+                if (newI != CI) {
+                    ++it;
+                    CI->replaceAllUsesWith(newI);
+                    CI->eraseFromParent();
+                    continue;
+                }
             }
             ++it;
         }