diff --git a/src/Makefile b/src/Makefile
index 1dc8414430280..a7c548c0776a2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -347,7 +347,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de
 $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
 $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
 $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
-$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-mmtk.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
+$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
 $(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
 $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
 $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h
diff --git a/src/gc-common.h b/src/gc-common.h
index 49e8f038bb038..2feaf8923b9bd 100644
--- a/src/gc-common.h
+++ b/src/gc-common.h
@@ -16,6 +16,31 @@
 extern "C" {
 #endif
 
+// =========================================================================== //
+// GC Big objects
+// =========================================================================== //
+
+JL_EXTENSION typedef struct _bigval_t {
+    struct _bigval_t *next;
+    struct _bigval_t *prev;
+    size_t sz;
+#ifdef _P64 // Add padding so that the value is 64-byte aligned
+    // (8 pointers of 8 bytes each) - (4 other pointers in struct)
+    void *_padding[8 - 4];
+#else
+    // (16 pointers of 4 bytes each) - (4 other pointers in struct)
+    void *_padding[16 - 4];
+#endif
+    //struct jl_taggedvalue_t <>;
+    union {
+        uintptr_t header;
+        struct {
+            uintptr_t gc:2;
+        } bits;
+    };
+    // must be 64-byte aligned here, in 32 & 64 bit modes
+} bigval_t;
+
 // =========================================================================== //
 // GC Callbacks
 // =========================================================================== //
diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index a7f34c7ea6512..6519c6c34c48d 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -1,6 +1,5 @@
 #include "gc-common.h"
 #include "mmtkMutator.h"
-#include "gc-mmtk.h"
 #include "threading.h"
 
 // File exists in the binding
diff --git a/src/gc-mmtk.h b/src/gc-mmtk.h
deleted file mode 100644
index e83d255fe4835..0000000000000
--- a/src/gc-mmtk.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#include
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-JL_EXTENSION typedef struct _bigval_t {
-    size_t sz;
-#ifdef _P64 // Add padding so that the value is 64-byte aligned
-    // (8 pointers of 8 bytes each) - (2 other pointers in struct)
-    void *_padding[8 - 2];
-#else
-    // (16 pointers of 4 bytes each) - (2 other pointers in struct)
-    void *_padding[16 - 2];
-#endif
-    //struct jl_taggedvalue_t <>;
-    union {
-        uintptr_t header;
-        struct {
-            uintptr_t gc:2;
-        } bits;
-    };
-    // must be 64-byte aligned here, in 32 & 64 bit modes
-} bigval_t;
-
-#ifdef __cplusplus
-}
-#endif
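Both versions of bigval_t pad to the same 64-byte budget: the deleted MMTk-only variant counted sz plus the header union ("2 other pointers") and used 8 - 2 padding words, while the unified struct kept in gc-common.h also carries the stock GC's next/prev list pointers ("4 other pointers") and uses 8 - 4. The arithmetic can be sanity-checked in isolation; the mirror struct below is a hypothetical test, not part of the patch, and assumes a 64-bit (_P64) target:

    #include <cstddef>
    #include <cstdint>

    // Stand-alone mirror of the unified 64-bit layout: next, prev, sz and the
    // header union are the "4 other pointers", so 8 - 4 padding words bring
    // the struct to exactly 8 words = 64 bytes.
    struct BigvalCheck {
        BigvalCheck *next;
        BigvalCheck *prev;
        size_t sz;
        void *_padding[8 - 4];
        union {
            uintptr_t header;
            struct { uintptr_t gc : 2; } bits;
        };
    };
    static_assert(sizeof(BigvalCheck) == 64, "bigval_t layout must span 64 bytes");

Since the single definition now covers both collectors, gc-mmtk.h has nothing left to declare, which is what lets the header go away entirely.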
diff --git a/src/gc-stock.h b/src/gc-stock.h
index 710c3adf46af3..d478ee1366da0 100644
--- a/src/gc-stock.h
+++ b/src/gc-stock.h
@@ -19,6 +19,7 @@
 #include "julia_internal.h"
 #include "julia_assert.h"
 #include "threading.h"
+#include "gc-common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -84,27 +85,6 @@ typedef struct _jl_gc_chunk_t {
 
 extern uintptr_t gc_bigval_sentinel_tag;
 
-JL_EXTENSION typedef struct _bigval_t {
-    struct _bigval_t *next;
-    struct _bigval_t *prev;
-    size_t sz;
-#ifdef _P64 // Add padding so that the value is 64-byte aligned
-    // (8 pointers of 8 bytes each) - (4 other pointers in struct)
-    void *_padding[8 - 4];
-#else
-    // (16 pointers of 4 bytes each) - (4 other pointers in struct)
-    void *_padding[16 - 4];
-#endif
-    //struct jl_taggedvalue_t <>;
-    union {
-        uintptr_t header;
-        struct {
-            uintptr_t gc:2;
-        } bits;
-    };
-    // must be 64-byte aligned here, in 32 & 64 bit modes
-} bigval_t;
-
 // pool page metadata
 typedef struct _jl_gc_pagemeta_t {
     // next metadata structure in per-thread list
diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h
index fb7fbb729fb47..b93fd2ddd91dc 100644
--- a/src/llvm-gc-interface-passes.h
+++ b/src/llvm-gc-interface-passes.h
@@ -368,9 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
     void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef<int> CalleeRoots);
     Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
     Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
-#ifdef MMTK_GC
     Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
-#endif
 };
 
 // The final GC lowering pass. This pass lowers platform-agnostic GC
diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-late-gc-lowering-mmtk.cpp
index 786be78498074..6342fc9883845 100644
--- a/src/llvm-late-gc-lowering-mmtk.cpp
+++ b/src/llvm-late-gc-lowering-mmtk.cpp
@@ -1,3 +1,5 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
 #include "llvm-gc-interface-passes.h"
 
 void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
@@ -45,3 +47,99 @@ void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *calle
         builder.CreateCall(getOrDeclare(jl_well_known::GCPreserveEndHook), {});
     }
 }
+
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+    assert(target->arg_size() == 3);
+
+    IRBuilder<> builder(target);
+    auto ptls = target->getArgOperand(0);
+    auto type = target->getArgOperand(2);
+    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
+        size_t sz = (size_t)CI->getZExtValue();
+        // This is strongly architecture and OS dependent
+        int osize;
+        int offset = jl_gc_classify_pools(sz, &osize);
+        if (offset >= 0) {
+            // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
+            // We do a slowpath/fastpath check and lower it only on the slowpath, returning
+            // the cursor and updating it in the fastpath.
+            auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+            auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
+
+            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
+            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
+            const bool INLINE_FASTPATH_ALLOCATION = true;
+
+            if (INLINE_FASTPATH_ALLOCATION) {
+                // Assuming we use the first immix allocator.
+                // FIXME: We should get the allocator index and type from MMTk.
+                auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
+                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
+
+                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
+                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
+
+                // offset = 8
+                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+                // alignment 16 (15 = 16 - 1)
+                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+                auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
+                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
+
+                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
+
+                auto next_instr = target->getNextNode();
+                SmallVector<uint32_t, 2> Weights{1, 9};
+
+                MDBuilder MDB(F.getContext());
+                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
+
+                builder.SetInsertPoint(next_instr);
+                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
+
+                // slowpath
+                builder.SetInsertPoint(slowpath);
+                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+                auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
+                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+                builder.CreateBr(next_instr->getParent());
+
+                // fastpath
+                builder.SetInsertPoint(fastpath);
+                builder.CreateStore(new_cursor, cursor_ptr);
+
+                // ptls->gc_tls.gc_num.allocd += osize;
+                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
+                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+                auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
+                builder.CreateBr(next_instr->getParent());
+
+                phiNode->addIncoming(new_call, slowpath);
+                phiNode->addIncoming(v_as_ptr, fastpath);
+                phiNode->takeName(target);
+                return phiNode;
+            }
+        }
+    }
+    return target;
+}
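The delta arithmetic in the fastpath above is a branch-free align-up. The object address, which sits one 8-byte tag word (sizeof(jl_taggedvalue_t)) past result, must be 16-byte aligned, and x + ((-x) & 15) rounds x up to the next multiple of 16, so the emitted sequence computes result = cursor + ((-(cursor + 8)) & 15). The loop below is a hypothetical stand-alone check of that identity, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
        for (uint64_t cursor = 0; cursor < 256; cursor++) {
            // Mirrors the IR: delta = (0 - (cursor + 8)) & 15; result = cursor + delta
            uint64_t delta = (0 - (cursor + 8)) & 15;
            uint64_t result = cursor + delta;
            assert((result + 8) % 16 == 0); // object address (tag word + 8) is aligned
            assert(result >= cursor);       // the bump never moves backwards
        }
        return 0;
    }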
diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp
index 0f2d1fa009d88..1e1364f6b61e3 100644
--- a/src/llvm-late-gc-lowering-stock.cpp
+++ b/src/llvm-late-gc-lowering-stock.cpp
@@ -1,5 +1,13 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
 #include "llvm-gc-interface-passes.h"
 
 void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
     // Do nothing for the stock GC
 }
+
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+    // Do nothing for the stock GC
+    return target;
+}
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index fc62ab5109133..841fdc2ef6332 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2499,119 +2499,6 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
-#ifdef MMTK_GC
-Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
-{
-    assert(target->arg_size() == 3);
-
-    IRBuilder<> builder(target);
-    auto ptls = target->getArgOperand(0);
-    auto type = target->getArgOperand(2);
-    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
-        size_t sz = (size_t)CI->getZExtValue();
-        // This is strongly architecture and OS dependent
-        int osize;
-        int offset = jl_gc_classify_pools(sz, &osize);
-        if (offset >= 0) {
-            // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
-            // We do a slowpath/fastpath check and lower it only on the slowpath, returning
-            // the cursor and updating it in the fastpath.
-            auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
-            auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
-
-            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
-            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
-            const bool INLINE_FASTPATH_ALLOCATION = true;
-
-            if (INLINE_FASTPATH_ALLOCATION) {
-                // Assuming we use the first immix allocator.
-                // FIXME: We should get the allocator index and type from MMTk.
-                auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
-
-                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
-                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
-
-                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
-                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
-                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
-
-                // offset = 8
-                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
-                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
-                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
-                // alignment 16 (15 = 16 - 1)
-                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
-                auto result = builder.CreateNSWAdd(cursor, delta, "result");
-
-                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
-
-                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
-                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
-                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
-
-                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
-
-                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
-                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
-
-                auto next_instr = target->getNextNode();
-                SmallVector<uint32_t, 2> Weights{1, 9};
-
-                MDBuilder MDB(F.getContext());
-                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
-
-                builder.SetInsertPoint(next_instr);
-                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
-
-                // slowpath
-                builder.SetInsertPoint(slowpath);
-                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
-                auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
-                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
-                builder.CreateBr(next_instr->getParent());
-
-                // fastpath
-                builder.SetInsertPoint(fastpath);
-                builder.CreateStore(new_cursor, cursor_ptr);
-
-                // ptls->gc_tls.gc_num.allocd += osize;
-                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
-                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
-                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
-                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
-                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
-                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
-
-                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
-                auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
-                builder.CreateBr(next_instr->getParent());
-
-                phiNode->addIncoming(new_call, slowpath);
-                phiNode->addIncoming(v_as_ptr, fastpath);
-                phiNode->takeName(target);
-                return phiNode;
-            }
-        }
-    }
-    return target;
-}
-
-template <typename TIterator>
-static void replaceInstruction(
-    Instruction *oldInstruction,
-    Value *newInstruction,
-    TIterator &it)
-{
-    if (newInstruction != oldInstruction) {
-        oldInstruction->replaceAllUsesWith(newInstruction);
-        it = oldInstruction->eraseFromParent();
-    }
-    else {
-        ++it;
-    }
-}
-#endif
-
 bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     initAll(*F.getParent());
     smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc);
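The deleted replaceInstruction helper relied on eraseFromParent() returning the next instruction-list iterator; the rewritten loop in the hunk below walks the intrinsic's use list instead, so it advances the iterator past the call before erasing it. A sketch of that pattern, with shouldRewrite and makeReplacement as hypothetical stand-ins for the real lowering:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    bool shouldRewrite(CallInst *CI);     // hypothetical predicate
    Value *makeReplacement(CallInst *CI); // hypothetical rewrite

    // Visit every call site of `intrinsic` via its def-use list (rather than
    // scanning every basic block), erasing users safely while iterating.
    void rewriteCallSites(Function *intrinsic) {
        for (auto it = intrinsic->user_begin(); it != intrinsic->user_end(); ) {
            auto *CI = dyn_cast<CallInst>(*it);
            if (CI && shouldRewrite(CI)) {
                ++it;                        // step past CI before erasing it
                CI->replaceAllUsesWith(makeReplacement(CI));
                CI->eraseFromParent();       // drops CI from the use list
                continue;
            }
            ++it;
        }
    }

This visits only the actual call sites of julia.gc_alloc_bytes instead of inspecting every instruction in the function, which is what the old doubly-nested loop did.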
@@ -2630,29 +2517,29 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
     CleanupIR(F, &S, CFGModified);
 
-#ifdef MMTK_GC
-    // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
-    for (BasicBlock &BB : F) {
-        for (auto it = BB.begin(); it != BB.end();) {
-            auto *CI = dyn_cast<CallInst>(&*it);
-            if (!CI) {
-                ++it;
-                continue;
-            }
-            Value *callee = CI->getCalledOperand();
-            assert(callee);
+    // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
+    // For now, we do nothing for the Stock GC
+    auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
 
-            auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
-            if (GCAllocBytes == callee) {
+    if (GCAllocBytes) {
+        for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
                 *CFGModified = true;
-                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
-                continue;
+
+                assert(CI->getCalledOperand() == GCAllocBytes);
+
+                auto newI = lowerGCAllocBytesLate(CI, F);
+                if (newI != CI) {
+                    ++it;
+                    CI->replaceAllUsesWith(newI);
+                    CI->eraseFromParent();
+                    continue;
+                }
             }
             ++it;
         }
     }
-#endif
 
     return true;
 }
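A final note on the fastpath emission: the Weights{1, 9} vector biases the limit check. SplitBlockAndInsertIfThenElse pairs the weights with its then/else destinations in order, so the slowpath (then) is weighted 1 against the fastpath's (else) 9, telling later optimizations to treat out-of-line allocation as the cold path. A minimal sketch of building that metadata (function name illustrative):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Branch weights as used above: then-successor (slowpath) 1, else-successor
    // (fastpath) 9, i.e. the inline bump allocation is expected ~90% of the time.
    MDNode *makeAllocBranchWeights(LLVMContext &Ctx) {
        MDBuilder MDB(Ctx);
        SmallVector<uint32_t, 2> Weights{1, 9};
        return MDB.createBranchWeights(Weights);
    }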