diff --git a/src/Makefile b/src/Makefile
index 1dc8414430280..a7c548c0776a2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -347,7 +347,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de
 $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
 $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
 $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
-$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-mmtk.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
+$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
 $(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
 $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
 $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h
diff --git a/src/gc-common.h b/src/gc-common.h
index 49e8f038bb038..2feaf8923b9bd 100644
--- a/src/gc-common.h
+++ b/src/gc-common.h
@@ -16,6 +16,31 @@
 extern "C" {
 #endif
 
+// =========================================================================== //
+// GC Big objects
+// =========================================================================== //
+
+JL_EXTENSION typedef struct _bigval_t {
+    struct _bigval_t *next;
+    struct _bigval_t *prev;
+    size_t sz;
+#ifdef _P64 // Add padding so that the value is 64-byte aligned
+    // (8 pointers of 8 bytes each) - (4 other pointers in struct)
+    void *_padding[8 - 4];
+#else
+    // (16 pointers of 4 bytes each) - (4 other pointers in struct)
+    void *_padding[16 - 4];
+#endif
+    //struct jl_taggedvalue_t <>;
+    union {
+        uintptr_t header;
+        struct {
+            uintptr_t gc:2;
+        } bits;
+    };
+    // must be 64-byte aligned here, in 32 & 64 bit modes
+} bigval_t;
+
 // =========================================================================== //
 // GC Callbacks
 // =========================================================================== //
diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index a7f34c7ea6512..6519c6c34c48d 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -1,6 +1,5 @@
 #include "gc-common.h"
 #include "mmtkMutator.h"
-#include "gc-mmtk.h"
 #include "threading.h"
 
 // File exists in the binding
diff --git a/src/gc-mmtk.h b/src/gc-mmtk.h
deleted file mode 100644
index e83d255fe4835..0000000000000
--- a/src/gc-mmtk.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#include
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-JL_EXTENSION typedef struct _bigval_t {
-    size_t sz;
-#ifdef _P64 // Add padding so that the value is 64-byte aligned
-    // (8 pointers of 8 bytes each) - (2 other pointers in struct)
-    void *_padding[8 - 2];
-#else
-    // (16 pointers of 4 bytes each) - (2 other pointers in struct)
-    void *_padding[16 - 2];
-#endif
-    //struct jl_taggedvalue_t <>;
-    union {
-        uintptr_t header;
-        struct {
-            uintptr_t gc:2;
-        } bits;
-    };
-    // must be 64-byte aligned here, in 32 & 64 bit modes
-} bigval_t;
-
-#ifdef __cplusplus
-}
-#endif
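Both versions of bigval_t pad to the same 64-byte budget: the deleted MMTk-only variant counted sz plus the header union ("2 other pointers") and used 8 - 2 padding words, while the unified struct kept in gc-common.h also carries the stock GC's next/prev list pointers ("4 other pointers") and uses 8 - 4. The arithmetic can be sanity-checked in isolation; the mirror struct below is a hypothetical test, not part of the patch, and assumes a 64-bit (_P64) target:

    #include <cstddef>
    #include <cstdint>

    // Stand-alone mirror of the unified 64-bit layout: next, prev, sz and the
    // header union are the "4 other pointers", so 8 - 4 padding words bring
    // the struct to exactly 8 words = 64 bytes.
    struct BigvalCheck {
        BigvalCheck *next;
        BigvalCheck *prev;
        size_t sz;
        void *_padding[8 - 4];
        union {
            uintptr_t header;
            struct { uintptr_t gc : 2; } bits;
        };
    };
    static_assert(sizeof(BigvalCheck) == 64, "bigval_t layout must span 64 bytes");

Since the single definition now covers both collectors, gc-mmtk.h has nothing left to declare, which is what lets the header go away entirely.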
diff --git a/src/gc-stock.h b/src/gc-stock.h
index 710c3adf46af3..d478ee1366da0 100644
--- a/src/gc-stock.h
+++ b/src/gc-stock.h
@@ -19,6 +19,7 @@
 #include "julia_internal.h"
 #include "julia_assert.h"
 #include "threading.h"
+#include "gc-common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -84,27 +85,6 @@ typedef struct _jl_gc_chunk_t {
 
 extern uintptr_t gc_bigval_sentinel_tag;
 
-JL_EXTENSION typedef struct _bigval_t {
-    struct _bigval_t *next;
-    struct _bigval_t *prev;
-    size_t sz;
-#ifdef _P64 // Add padding so that the value is 64-byte aligned
-    // (8 pointers of 8 bytes each) - (4 other pointers in struct)
-    void *_padding[8 - 4];
-#else
-    // (16 pointers of 4 bytes each) - (4 other pointers in struct)
-    void *_padding[16 - 4];
-#endif
-    //struct jl_taggedvalue_t <>;
-    union {
-        uintptr_t header;
-        struct {
-            uintptr_t gc:2;
-        } bits;
-    };
-    // must be 64-byte aligned here, in 32 & 64 bit modes
-} bigval_t;
-
 // pool page metadata
 typedef struct _jl_gc_pagemeta_t {
     // next metadata structure in per-thread list
diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h
index fb7fbb729fb47..b93fd2ddd91dc 100644
--- a/src/llvm-gc-interface-passes.h
+++ b/src/llvm-gc-interface-passes.h
@@ -368,9 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
     void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef<int> CalleeRoots);
     Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
     Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
-#ifdef MMTK_GC
     Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
-#endif
 };
 
 // The final GC lowering pass. This pass lowers platform-agnostic GC
diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-late-gc-lowering-mmtk.cpp
index 786be78498074..6342fc9883845 100644
--- a/src/llvm-late-gc-lowering-mmtk.cpp
+++ b/src/llvm-late-gc-lowering-mmtk.cpp
@@ -1,3 +1,5 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
 #include "llvm-gc-interface-passes.h"
 
 void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
@@ -45,3 +47,99 @@ void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *calle
         builder.CreateCall(getOrDeclare(jl_well_known::GCPreserveEndHook), {});
     }
 }
+
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+    assert(target->arg_size() == 3);
+
+    IRBuilder<> builder(target);
+    auto ptls = target->getArgOperand(0);
+    auto type = target->getArgOperand(2);
+    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
+        size_t sz = (size_t)CI->getZExtValue();
+        // This is strongly architecture and OS dependent
+        int osize;
+        int offset = jl_gc_classify_pools(sz, &osize);
+        if (offset >= 0) {
+            // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
+            // We do a slowpath/fastpath check and lower it only on the slowpath, returning
+            // the cursor and updating it in the fastpath.
+            auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+            auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
+
+            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
+            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
+            const bool INLINE_FASTPATH_ALLOCATION = true;
+
+            if (INLINE_FASTPATH_ALLOCATION) {
+                // Assuming we use the first immix allocator.
+                // FIXME: We should get the allocator index and type from MMTk.
+                auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
+                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
+
+                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
+                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
+
+                // offset = 8
+                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+                // alignment 16 (15 = 16 - 1)
+                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+                auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
+                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
+
+                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
+
+                auto next_instr = target->getNextNode();
+                SmallVector<uint32_t, 2> Weights{1, 9};
+
+                MDBuilder MDB(F.getContext());
+                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
+
+                builder.SetInsertPoint(next_instr);
+                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
+
+                // slowpath
+                builder.SetInsertPoint(slowpath);
+                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+                auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
+                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+                builder.CreateBr(next_instr->getParent());
+
+                // fastpath
+                builder.SetInsertPoint(fastpath);
+                builder.CreateStore(new_cursor, cursor_ptr);
+
+                // ptls->gc_tls.gc_num.allocd += osize;
+                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
+                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+                auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
+                builder.CreateBr(next_instr->getParent());
+
+                phiNode->addIncoming(new_call, slowpath);
+                phiNode->addIncoming(v_as_ptr, fastpath);
+                phiNode->takeName(target);
+                return phiNode;
+            }
+        }
+    }
+    return target;
+}
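The delta arithmetic in the fastpath above is a branch-free align-up. The object address, which sits one 8-byte tag word (sizeof(jl_taggedvalue_t)) past result, must be 16-byte aligned, and x + ((-x) & 15) rounds x up to the next multiple of 16, so the emitted sequence computes result = cursor + ((-(cursor + 8)) & 15). The loop below is a hypothetical stand-alone check of that identity, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
        for (uint64_t cursor = 0; cursor < 256; cursor++) {
            // Mirrors the IR: delta = (0 - (cursor + 8)) & 15; result = cursor + delta
            uint64_t delta = (0 - (cursor + 8)) & 15;
            uint64_t result = cursor + delta;
            assert((result + 8) % 16 == 0); // object address (tag word + 8) is aligned
            assert(result >= cursor);       // the bump never moves backwards
        }
        return 0;
    }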
diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp
index 0f2d1fa009d88..1e1364f6b61e3 100644
--- a/src/llvm-late-gc-lowering-stock.cpp
+++ b/src/llvm-late-gc-lowering-stock.cpp
@@ -1,5 +1,13 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
 #include "llvm-gc-interface-passes.h"
 
 void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
     // Do nothing for the stock GC
 }
+
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+    // Do nothing for the stock GC
+    return target;
+}
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index fc62ab5109133..841fdc2ef6332 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2499,119 +2499,6 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
-#ifdef MMTK_GC
-Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
-{
-    assert(target->arg_size() == 3);
-
-    IRBuilder<> builder(target);
-    auto ptls = target->getArgOperand(0);
-    auto type = target->getArgOperand(2);
-    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
-        size_t sz = (size_t)CI->getZExtValue();
-        // This is strongly architecture and OS dependent
-        int osize;
-        int offset = jl_gc_classify_pools(sz, &osize);
-        if (offset >= 0) {
-            // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
-            // We do a slowpath/fastpath check and lower it only on the slowpath, returning
-            // the cursor and updating it in the fastpath.
-            auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
-            auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
-
-            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
-            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
-            const bool INLINE_FASTPATH_ALLOCATION = true;
-
-            if (INLINE_FASTPATH_ALLOCATION) {
-                // Assuming we use the first immix allocator.
-                // FIXME: We should get the allocator index and type from MMTk.
-                auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
-
-                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
-                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
-
-                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
-                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
-                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
-
-                // offset = 8
-                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
-                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
-                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
-                // alignment 16 (15 = 16 - 1)
-                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
-                auto result = builder.CreateNSWAdd(cursor, delta, "result");
-
-                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
-
-                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
-                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
-                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
-
-                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
-
-                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
-                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
-
-                auto next_instr = target->getNextNode();
-                SmallVector<uint32_t, 2> Weights{1, 9};
-
-                MDBuilder MDB(F.getContext());
-                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
-
-                builder.SetInsertPoint(next_instr);
-                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
-
-                // slowpath
-                builder.SetInsertPoint(slowpath);
-                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
-                auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
-                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
-                builder.CreateBr(next_instr->getParent());
-
-                // fastpath
-                builder.SetInsertPoint(fastpath);
-                builder.CreateStore(new_cursor, cursor_ptr);
-
-                // ptls->gc_tls.gc_num.allocd += osize;
-                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
-                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
-                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
-                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
-                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
-                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
-
-                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
-                auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
-                builder.CreateBr(next_instr->getParent());
-
-                phiNode->addIncoming(new_call, slowpath);
-                phiNode->addIncoming(v_as_ptr, fastpath);
-                phiNode->takeName(target);
-                return phiNode;
-            }
-        }
-    }
-    return target;
-}
-
-template <typename TIterator>
-static void replaceInstruction(
-    Instruction *oldInstruction,
-    Value *newInstruction,
-    TIterator &it)
-{
-    if (newInstruction != oldInstruction) {
-        oldInstruction->replaceAllUsesWith(newInstruction);
-        it = oldInstruction->eraseFromParent();
-    }
-    else {
-        ++it;
-    }
-}
-#endif
-
 bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     initAll(*F.getParent());
     smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc);
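The deleted replaceInstruction helper relied on eraseFromParent() returning the next instruction-list iterator; the rewritten loop in the hunk below walks the intrinsic's use list instead, so it advances the iterator past the call before erasing it. A sketch of that pattern, with shouldRewrite and makeReplacement as hypothetical stand-ins for the real lowering:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    bool shouldRewrite(CallInst *CI);     // hypothetical predicate
    Value *makeReplacement(CallInst *CI); // hypothetical rewrite

    // Visit every call site of `intrinsic` via its def-use list (rather than
    // scanning every basic block), erasing users safely while iterating.
    void rewriteCallSites(Function *intrinsic) {
        for (auto it = intrinsic->user_begin(); it != intrinsic->user_end(); ) {
            auto *CI = dyn_cast<CallInst>(*it);
            if (CI && shouldRewrite(CI)) {
                ++it;                        // step past CI before erasing it
                CI->replaceAllUsesWith(makeReplacement(CI));
                CI->eraseFromParent();       // drops CI from the use list
                continue;
            }
            ++it;
        }
    }

This visits only the actual call sites of julia.gc_alloc_bytes instead of inspecting every instruction in the function, which is what the old doubly-nested loop did.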
@@ -2630,29 +2517,29 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
     CleanupIR(F, &S, CFGModified);
 
-#ifdef MMTK_GC
-    // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
-    for (BasicBlock &BB : F) {
-        for (auto it = BB.begin(); it != BB.end();) {
-            auto *CI = dyn_cast<CallInst>(&*it);
-            if (!CI) {
-                ++it;
-                continue;
-            }
-            Value *callee = CI->getCalledOperand();
-            assert(callee);
+    // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
+    // For now, we do nothing for the Stock GC
+    auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
 
-            auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
-            if (GCAllocBytes == callee) {
+    if (GCAllocBytes) {
+        for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
                 *CFGModified = true;
-                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
-                continue;
+
+                assert(CI->getCalledOperand() == GCAllocBytes);
+
+                auto newI = lowerGCAllocBytesLate(CI, F);
+                if (newI != CI) {
+                    ++it;
+                    CI->replaceAllUsesWith(newI);
+                    CI->eraseFromParent();
+                    continue;
+                }
             }
             ++it;
         }
     }
-#endif
 
     return true;
 }
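A final note on the fastpath emission: the Weights{1, 9} vector biases the limit check. SplitBlockAndInsertIfThenElse pairs the weights with its then/else destinations in order, so the slowpath (then) is weighted 1 against the fastpath's (else) 9, telling later optimizations to treat out-of-line allocation as the cold path. A minimal sketch of building that metadata (function name illustrative):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Branch weights as used above: then-successor (slowpath) 1, else-successor
    // (fastpath) 9, i.e. the inline bump allocation is expected ~90% of the time.
    MDNode *makeAllocBranchWeights(LLVMContext &Ctx) {
        MDBuilder MDB(Ctx);
        SmallVector<uint32_t, 2> Weights{1, 9};
        return MDB.createBranchWeights(Weights);
    }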