Restructuring lowerGCAllocBytesLate pass
udesou committed Dec 2, 2024
1 parent 81d2cb3 commit f81e7de
Showing 4 changed files with 122 additions and 131 deletions.
2 changes: 0 additions & 2 deletions src/llvm-gc-interface-passes.h
@@ -368,9 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef<int> CalleeRoots);
Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
#ifdef MMTK_GC
Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
#endif
};

// The final GC lowering pass. This pass lowers platform-agnostic GC
98 changes: 98 additions & 0 deletions src/llvm-late-gc-lowering-mmtk.cpp
@@ -1,3 +1,5 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "llvm-gc-interface-passes.h"

void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
@@ -45,3 +47,99 @@ void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *calle
builder.CreateCall(getOrDeclare(jl_well_known::GCPreserveEndHook), {});
}
}

Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
assert(target->arg_size() == 3);

IRBuilder<> builder(target);
auto ptls = target->getArgOperand(0);
auto type = target->getArgOperand(2);
if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
size_t sz = (size_t)CI->getZExtValue();
// This is strongly architecture and OS dependent
int osize;
int offset = jl_gc_classify_pools(sz, &osize);
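// jl_gc_classify_pools writes the pool's size class into osize and returns the pool offset,
// or a negative value for sizes that do not fit a size pool, in which case the call is left
// untouched here.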
if (offset >= 0) {
// In this case, instead of lowering julia.gc_alloc_bytes directly to a jl_gc_small_alloc call,
// we emit a fastpath/slowpath check and only call jl_gc_small_alloc on the slowpath; the
// fastpath bumps the allocation cursor and returns the address derived from it.
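// As a rough, non-literal sketch (names abbreviated), the emitted logic is:
//   cursor     = immix_allocator.cursor;                  // loaded from the ptls TLS area
//   result     = cursor + ((-(cursor + 8)) & 15);         // align the payload to 16 bytes
//   new_cursor = result + osize;
//   if (new_cursor > limit)                               // slowpath (unlikely)
//       v = small_alloc(ptls, 1, osize, type);
//   else {                                                // fastpath
//       immix_allocator.cursor = new_cursor;
//       gc_num.allocd += osize;
//       v = (jl_value_t*)(result + sizeof(jl_taggedvalue_t));
//   }
//   // v then replaces the original julia.gc_alloc_bytes call through a phi node.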
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);

// Controls whether the inline fastpath allocation sequence is generated. This should always
// be enabled for MMTk; setting it to false adds significant allocation overhead and is only
// useful for debugging.
const bool INLINE_FASTPATH_ALLOCATION = true;

if (INLINE_FASTPATH_ALLOCATION) {
// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");
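// i.e. delta == (-(cursor + 8)) & 15, so result + 8 == align_up(cursor + 8, 16): the tag
// header (sizeof(jl_taggedvalue_t), 8 bytes on 64-bit) starts at result and the object
// payload ends up 16-byte aligned.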

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
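// Branch weights hinting that the slowpath (the "then" branch of the split below) is
// unlikely relative to the fastpath.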
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls_common.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
}
}
}
return target;
}
8 changes: 8 additions & 0 deletions src/llvm-late-gc-lowering-stock.cpp
@@ -1,5 +1,13 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "llvm-gc-interface-passes.h"

void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
// Do nothing for the stock GC
}

Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
// Do nothing for the stock GC
return target;
}
145 changes: 16 additions & 129 deletions src/llvm-late-gc-lowering.cpp
@@ -2499,119 +2499,6 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
}
}

#ifdef MMTK_GC
Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
assert(target->arg_size() == 3);

IRBuilder<> builder(target);
auto ptls = target->getArgOperand(0);
auto type = target->getArgOperand(2);
if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
size_t sz = (size_t)CI->getZExtValue();
// This is strongly architecture and OS dependent
int osize;
int offset = jl_gc_classify_pools(sz, &osize);
if (offset >= 0) {
// In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
// We do a slowpath/fastpath check and lower it only on the slowpath, returning
// the cursor and updating it in the fastpath.
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);

// Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
// Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
const bool INLINE_FASTPATH_ALLOCATION = true;

if (INLINE_FASTPATH_ALLOCATION) {
// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
}
}
}
return target;
}

template<typename TIterator>
static void replaceInstruction(
Instruction *oldInstruction,
Value *newInstruction,
TIterator &it)
{
if (newInstruction != oldInstruction) {
oldInstruction->replaceAllUsesWith(newInstruction);
it = oldInstruction->eraseFromParent();
}
else {
++it;
}
}
#endif

bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
initAll(*F.getParent());
smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc);
@@ -2630,29 +2517,29 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
CleanupIR(F, &S, CFGModified);

#ifdef MMTK_GC
// We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
for (BasicBlock &BB : F) {
for (auto it = BB.begin(); it != BB.end();) {
auto *CI = dyn_cast<CallInst>(&*it);
if (!CI) {
++it;
continue;
}

Value *callee = CI->getCalledOperand();
assert(callee);
// We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
// For now, we do nothing for the Stock GC
auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);

auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
if (GCAllocBytes == callee) {
if (GCAllocBytes) {
for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) {
if (auto *CI = dyn_cast<CallInst>(*it)) {
*CFGModified = true;
replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
continue;

assert(CI->getCalledOperand() == GCAllocBytes);

auto newI = lowerGCAllocBytesLate(CI, F);
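// If lowering produced a replacement value, advance the use-list iterator before erasing CI:
// erasing the call also removes this use of GCAllocBytes and would otherwise invalidate the
// iterator.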
if (newI != CI) {
++it;
CI->replaceAllUsesWith(newI);
CI->eraseFromParent();
continue;
}
}
++it;
}
}
#endif

return true;
}