Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move bigval_t struct to gc-common.h and loop through GCAllocBytes uses to apply fastpath allocation for MMTk #75

Merged
merged 2 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de
$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
$(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-mmtk.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
$(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
$(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h
Expand Down
25 changes: 25 additions & 0 deletions src/gc-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,31 @@
extern "C" {
#endif

// =========================================================================== //
// GC Big objects
// =========================================================================== //

JL_EXTENSION typedef struct _bigval_t {
struct _bigval_t *next;
struct _bigval_t *prev;
size_t sz;
#ifdef _P64 // Add padding so that the value is 64-byte aligned
// (8 pointers of 8 bytes each) - (4 other pointers in struct)
void *_padding[8 - 4];
#else
// (16 pointers of 4 bytes each) - (4 other pointers in struct)
void *_padding[16 - 4];
#endif
//struct jl_taggedvalue_t <>;
union {
uintptr_t header;
struct {
uintptr_t gc:2;
} bits;
};
// must be 64-byte aligned here, in 32 & 64 bit modes
} bigval_t;

// =========================================================================== //
// GC Callbacks
// =========================================================================== //
Expand Down
1 change: 0 additions & 1 deletion src/gc-mmtk.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include "gc-common.h"
#include "mmtkMutator.h"
#include "gc-mmtk.h"
#include "threading.h"

// File exists in the binding
Expand Down
30 changes: 0 additions & 30 deletions src/gc-mmtk.h

This file was deleted.

22 changes: 1 addition & 21 deletions src/gc-stock.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "julia_internal.h"
#include "julia_assert.h"
#include "threading.h"
#include "gc-common.h"

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -84,27 +85,6 @@ typedef struct _jl_gc_chunk_t {

extern uintptr_t gc_bigval_sentinel_tag;

JL_EXTENSION typedef struct _bigval_t {
struct _bigval_t *next;
struct _bigval_t *prev;
size_t sz;
#ifdef _P64 // Add padding so that the value is 64-byte aligned
// (8 pointers of 8 bytes each) - (4 other pointers in struct)
void *_padding[8 - 4];
#else
// (16 pointers of 4 bytes each) - (4 other pointers in struct)
void *_padding[16 - 4];
#endif
//struct jl_taggedvalue_t <>;
union {
uintptr_t header;
struct {
uintptr_t gc:2;
} bits;
};
// must be 64-byte aligned here, in 32 & 64 bit modes
} bigval_t;

// pool page metadata
typedef struct _jl_gc_pagemeta_t {
// next metadata structure in per-thread list
Expand Down
2 changes: 0 additions & 2 deletions src/llvm-gc-interface-passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef<int> CalleeRoots);
Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
#ifdef MMTK_GC
Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
#endif
};

// The final GC lowering pass. This pass lowers platform-agnostic GC
Expand Down
98 changes: 98 additions & 0 deletions src/llvm-late-gc-lowering-mmtk.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "llvm-gc-interface-passes.h"

void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
Expand Down Expand Up @@ -45,3 +47,99 @@ void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *calle
builder.CreateCall(getOrDeclare(jl_well_known::GCPreserveEndHook), {});
}
}

Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
assert(target->arg_size() == 3);

IRBuilder<> builder(target);
auto ptls = target->getArgOperand(0);
auto type = target->getArgOperand(2);
if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
size_t sz = (size_t)CI->getZExtValue();
// This is strongly architecture and OS dependent
int osize;
int offset = jl_gc_classify_pools(sz, &osize);
if (offset >= 0) {
// In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
// We do a slowpath/fastpath check and lower it only on the slowpath, returning
// the cursor and updating it in the fastpath.
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);

// Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
// Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
const bool INLINE_FASTPATH_ALLOCATION = true;

if (INLINE_FASTPATH_ALLOCATION) {
// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
}
}
}
return target;
}
8 changes: 8 additions & 0 deletions src/llvm-late-gc-lowering-stock.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "llvm-gc-interface-passes.h"

void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
// Do nothing for the stock GC
}

Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
// Do nothing for the stock GC
return target;
}
Loading