Skip to content

Commit

Permalink
codegen: fix optimization misses from IR changes
Browse files Browse the repository at this point in the history
- dereferenceable attribute requires align attribute (if eltype is not
sized) or it is simply ignored

- tbaa_const isn't necessarily sufficient to allow LICM hoisting. an
unknown call instruction (such as julia.gc_alloc_obj) present in the
same loop can prevent TBAA from declaring the memory to be immutable.
However, the invariant.load attribute is much stronger:
additionally mark all tbaa_const loads with MD_immutable_load,
permitting llvm to reorder and lift them much more freely
(requiring only dereferenceable).
  • Loading branch information
vtjnash committed Jul 9, 2018
1 parent 808f266 commit d803472
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 136 deletions.
56 changes: 27 additions & 29 deletions src/cgutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

// Attach TBAA metadata `md` to a load or store and return the instruction.
// Loads from constant memory (tbaa_const) are additionally tagged
// !invariant.load: TBAA alone is not always enough for LICM to hoist them
// (an unknown call such as julia.gc_alloc_obj in the same loop can prevent
// TBAA from proving the memory immutable), while invariant.load lets LLVM
// reorder and hoist them freely (requiring only dereferenceability).
static Instruction *tbaa_decorate(MDNode *md, Instruction *load_or_store)
{
    load_or_store->setMetadata(llvm::LLVMContext::MD_tbaa, md);
    if (isa<LoadInst>(load_or_store) && md == tbaa_const)
        load_or_store->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(md->getContext(), None));
    return load_or_store;
}

Expand Down Expand Up @@ -358,20 +360,29 @@ static size_t dereferenceable_size(jl_value_t *jt)
}
}

// If given alignment is 0 and LLVM's assumed alignment for a load/store via ptr

This comment has been minimized.

Copy link
@KristofferC

KristofferC Jul 10, 2018

Member

Is this comment stale now?

This comment has been minimized.

Copy link
@vtjnash

vtjnash Jul 10, 2018

Author Member

yes, thanks

// might be stricter than the Julia alignment for jltype, return the alignment of jltype.
// Otherwise return the given alignment.
// Return the Julia-mandated alignment for a value of type `jltype`
// (its datatype alignment, which must divide JL_HEAP_ALIGNMENT).
static unsigned julia_alignment(jl_value_t *jltype)
{
    const unsigned align = jl_datatype_align(jltype);
    assert(align <= JL_HEAP_ALIGNMENT);
    assert(JL_HEAP_ALIGNMENT % align == 0);
    return align;
}

// Mark argument `A` (holding a value of Julia type `jt`) as nonnull, and —
// when the pointee size is known — as dereferenceable(size).
// NOTE(review): LLVM ignores `dereferenceable` unless the pointee type is
// sized or an explicit `align` attribute is present, so an alignment
// attribute is added in the unsized case (mimics LLVM Loads.cpp isAligned).
static inline void maybe_mark_argument_dereferenceable(Argument *A, jl_value_t *jt)
{
    AttrBuilder B;
    // The `dereferenceable` below does not imply `nonnull` for
    // non-addrspace(0) pointers, so add nonnull explicitly.
    B.addAttribute(Attribute::NonNull);
    size_t size = dereferenceable_size(jt);
    if (size) {
        B.addDereferenceableAttr(size);
        if (!A->getType()->getPointerElementType()->isSized()) // mimic LLVM Loads.cpp isAligned
            B.addAlignmentAttr(julia_alignment(jt));
    }
    A->addAttrs(B);
}

static inline Instruction *maybe_mark_load_dereferenceable(Instruction *LI, bool can_be_null,
Expand Down Expand Up @@ -1229,19 +1240,6 @@ static Value *emit_bounds_check(jl_codectx_t &ctx, const jl_cgval_t &ainfo, jl_v
return im1;
}

// If given alignment is 0 and LLVM's assumed alignment for a load/store via ptr
// might be stricter than the Julia alignment for jltype, return the alignment of jltype.
// Otherwise return the given alignment.
// NOTE(review): alignment == 0 acts as "unspecified"; the result is then
// jltype's natural datatype alignment, asserted to divide JL_HEAP_ALIGNMENT.
static unsigned julia_alignment(jl_value_t *jltype, unsigned alignment)
{
if (!alignment) {
// fall back to the type's own alignment when the caller gave none
alignment = jl_datatype_align(jltype);
assert(alignment <= JL_HEAP_ALIGNMENT);
assert(JL_HEAP_ALIGNMENT % alignment == 0);
}
return alignment;
}

static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_value_t *jt, Value* dest = NULL, MDNode *tbaa_dest = nullptr, bool isVolatile = false);

static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, jl_value_t *jltype,
Expand All @@ -1267,8 +1265,8 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
// elt = data;
//}
//else {
Instruction *load = ctx.builder.CreateAlignedLoad(data, isboxed ?
alignment : julia_alignment(jltype, alignment), false);
Instruction *load = ctx.builder.CreateAlignedLoad(data, isboxed || alignment ?
alignment : julia_alignment(jltype), false);
if (isboxed)
load = maybe_mark_load_dereferenceable(load, true, jltype);
if (tbaa) {
Expand Down Expand Up @@ -1316,7 +1314,7 @@ static void typed_store(jl_codectx_t &ctx,
}
if (idx_0based)
data = ctx.builder.CreateInBoundsGEP(r->getType(), data, idx_0based);
Instruction *store = ctx.builder.CreateAlignedStore(r, data, isboxed ? alignment : julia_alignment(jltype, alignment));
Instruction *store = ctx.builder.CreateAlignedStore(r, data, isboxed || alignment ? alignment : julia_alignment(jltype));
if (tbaa)
tbaa_decorate(tbaa, store);
}
Expand Down Expand Up @@ -2229,7 +2227,7 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, MDNode *tbaa_dst, con
else {
Value *src_ptr = data_pointer(ctx, src);
unsigned nb = jl_datatype_size(typ);
unsigned alignment = julia_alignment(typ, 0);
unsigned alignment = julia_alignment(typ);
Value *nbytes = ConstantInt::get(T_size, nb);
if (skip) {
// TODO: this Select is very bad for performance, but is necessary to work around LLVM bugs with the undef option that we want to use:
Expand All @@ -2256,7 +2254,7 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, MDNode *tbaa_dst, con
bool allunboxed = for_each_uniontype_small(
[&](unsigned idx, jl_datatype_t *jt) {
unsigned nb = jl_datatype_size(jt);
unsigned alignment = julia_alignment((jl_value_t*)jt, 0);
unsigned alignment = julia_alignment((jl_value_t*)jt);
BasicBlock *tempBB = BasicBlock::Create(jl_LLVMContext, "union_move", ctx.f);
ctx.builder.SetInsertPoint(tempBB);
switchInst->addCase(ConstantInt::get(T_int8, idx), tempBB);
Expand Down
8 changes: 4 additions & 4 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4186,12 +4186,12 @@ static void emit_cfunc_invalidate(
}
else {
gf_ret = emit_bitcast(ctx, gf_ret, gfrt->getPointerTo());
ctx.builder.CreateRet(ctx.builder.CreateAlignedLoad(gf_ret, julia_alignment(astrt, 0)));
ctx.builder.CreateRet(ctx.builder.CreateAlignedLoad(gf_ret, julia_alignment(astrt)));
}
break;
}
case jl_returninfo_t::SRet: {
emit_memcpy(ctx, &*gf_thunk->arg_begin(), nullptr, gf_ret, nullptr, jl_datatype_size(astrt), julia_alignment(astrt, 0));
emit_memcpy(ctx, &*gf_thunk->arg_begin(), nullptr, gf_ret, nullptr, jl_datatype_size(astrt), julia_alignment(astrt));
ctx.builder.CreateRetVoid();
break;
}
Expand Down Expand Up @@ -5038,7 +5038,7 @@ static Function *gen_invoke_wrapper(jl_method_instance_t *lam, const jl_returnin
if (lty != NULL && !isboxed) {
theArg = decay_derived(emit_bitcast(ctx, theArg, PointerType::get(lty, 0)));
if (!lty->isAggregateType()) // keep "aggregate" type values in place as pointers
theArg = ctx.builder.CreateAlignedLoad(theArg, julia_alignment(ty, 0));
theArg = ctx.builder.CreateAlignedLoad(theArg, julia_alignment(ty));
}
assert(dyn_cast<UndefValue>(theArg) == NULL);
args[idx] = theArg;
Expand Down Expand Up @@ -6134,7 +6134,7 @@ static std::unique_ptr<Module> emit_function(
if (returninfo.cc == jl_returninfo_t::SRet) {
assert(jl_is_concrete_type(jlrettype));
emit_memcpy(ctx, sret, nullptr, retvalinfo, jl_datatype_size(jlrettype),
julia_alignment(jlrettype, 0));
julia_alignment(jlrettype));
}
else { // must be jl_returninfo_t::Union
emit_unionmove(ctx, sret, nullptr, retvalinfo, /*skip*/isboxed_union);
Expand Down
2 changes: 1 addition & 1 deletion src/intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
return NULL;
}

unsigned alignment = julia_alignment(jt, 0);
unsigned alignment = julia_alignment(jt);
Type *ptype = to->getPointerTo();
if (dest) {
emit_memcpy(ctx, dest, tbaa_dest, p, x.tbaa, jl_datatype_size(jt), alignment, false);
Expand Down
116 changes: 16 additions & 100 deletions src/llvm-late-gc-lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,49 +31,6 @@

using namespace llvm;

namespace {
#if JL_LLVM_VERSION < 50000
static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
GlobalVariable *GV = M.getGlobalVariable(Name);
SmallPtrSet<Constant *, 16> InitAsSet;
SmallVector<Constant *, 16> Init;
if (GV) {
ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
for (auto &Op : CA->operands()) {
Constant *C = cast_or_null<Constant>(Op);
if (InitAsSet.insert(C).second)
Init.push_back(C);
}
GV->eraseFromParent();
}

Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
for (auto *V : Values) {
Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
if (InitAsSet.insert(C).second)
Init.push_back(C);
}

if (Init.empty())
return;

ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
ConstantArray::get(ATy, Init), Name);
GV->setSection("llvm.metadata");
}

static void append_to_compiler_used(Module &M, ArrayRef<GlobalValue *> Values) {
appendToUsedList(M, "llvm.compiler.used", Values);
}
#else
static void append_to_compiler_used(Module &M, ArrayRef<GlobalValue *> Values)
{
appendToCompilerUsed(M, Values);
}
#endif
}

/* Julia GC Root Placement pass. For a general overview of the design of GC
root lowering, see the devdocs. This file is the actual implementation.
Expand Down Expand Up @@ -340,16 +297,6 @@ namespace llvm {
void initializeLateLowerGCFramePass(PassRegistry &Registry);
}

template<typename T>
static void addReturnAttr(T *f, Attribute::AttrKind Kind)
{
#if JL_LLVM_VERSION >= 50000
f->addAttribute(AttributeList::ReturnIndex, Kind);
#else
f->addAttribute(AttributeSet::ReturnIndex, Kind);
#endif
}

extern std::pair<MDNode*,MDNode*> tbaa_make_child(const char *name, MDNode *parent=nullptr, bool isConstant=false);
struct LateLowerGCFrame: public FunctionPass {
static char ID;
Expand Down Expand Up @@ -822,6 +769,8 @@ JL_USED_FUNC static void dumpLivenessState(Function &F, State &S) {
// jtbaa_immut.
static bool isLoadFromImmut(LoadInst *LI)
{
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
MDNode *TBAA = LI->getMetadata(LLVMContext::MD_tbaa);
if (!TBAA)
return false;
Expand Down Expand Up @@ -1573,7 +1522,6 @@ Value *LateLowerGCFrame::EmitLoadTag(IRBuilder<> &builder, Value *V)
// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend.
// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled
// for NI pointers.
#if JL_LLVM_VERSION >= 40000
static SmallVector<int, 1> *FindRefinements(Value *V, State *S)
{
if (!S)
Expand All @@ -1593,12 +1541,6 @@ static bool IsPermRooted(Value *V, State *S)
return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2;
return false;
}
#else
static bool IsPermRooted(Value *V, State *S)
{
return false;
}
#endif

static inline void UpdatePtrNumbering(Value *From, Value *To, State *S)
{
Expand All @@ -1625,11 +1567,8 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
AllocaInst *Frame = nullptr;
if (T_prjlvalue) {
T_pprjlvalue = T_prjlvalue->getPointerTo();
Frame = new AllocaInst(T_prjlvalue,
#if JL_LLVM_VERSION >= 50000
0,
#endif
ConstantInt::get(T_int32, maxframeargs), "", StartOff);
Frame = new AllocaInst(T_prjlvalue, 0,
ConstantInt::get(T_int32, maxframeargs), "", StartOff);
}
SmallVector<CallInst*, 16> write_barriers;
for (BasicBlock &BB : F) {
Expand Down Expand Up @@ -1670,8 +1609,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
auto pool_osize = ConstantInt::get(T_int32, osize);
newI = builder.CreateCall(pool_alloc_func, {ptls, pool_offs, pool_osize});
}
addReturnAttr(newI, Attribute::NoAlias);
addReturnAttr(newI, Attribute::NonNull);
newI->setAttributes(newI->getCalledFunction()->getAttributes());
newI->takeName(CI);
auto store = builder.CreateStore(CI->getArgOperand(2),
EmitTagPtr(builder, T_prjlvalue, newI));
Expand Down Expand Up @@ -1726,26 +1664,10 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
CallInst *NewCall = CallInst::Create(newFptr, ReplacementArgs, "", CI);
NewCall->setTailCallKind(CI->getTailCallKind());
auto old_attrs = CI->getAttributes();
#if JL_LLVM_VERSION >= 50000
NewCall->setAttributes(AttributeList::get(CI->getContext(),
old_attrs.getFnAttributes(),
old_attrs.getRetAttributes(), {}));
#else
AttributeSet attr;
attr = attr.addAttributes(CI->getContext(), AttributeSet::ReturnIndex,
old_attrs.getRetAttributes())
.addAttributes(CI->getContext(), AttributeSet::FunctionIndex,
old_attrs.getFnAttributes());
NewCall->setAttributes(attr);
#endif
#if JL_LLVM_VERSION >= 40000
NewCall->copyMetadata(*CI);
#else
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
CI->getAllMetadata(MDs);
for (auto MD : MDs)
NewCall->setMetadata(MD.first, MD.second);
#endif
CI->replaceAllUsesWith(NewCall);
UpdatePtrNumbering(CI, NewCall, S);
} else if (CI->getNumArgOperands() == CI->getNumOperands()) {
Expand All @@ -1755,14 +1677,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
} else {
CallInst *NewCall = CallInst::Create(CI, None, CI);
NewCall->takeName(CI);
#if JL_LLVM_VERSION >= 40000
NewCall->copyMetadata(*CI);
#else
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
CI->getAllMetadata(MDs);
for (auto MD : MDs)
NewCall->setMetadata(MD.first, MD.second);
#endif
CI->replaceAllUsesWith(NewCall);
UpdatePtrNumbering(CI, NewCall, S);
}
Expand Down Expand Up @@ -1910,11 +1825,8 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(std::vector<int> &Colors, State
if (MaxColor != -1 || S.Allocas.size() != 0) {
unsigned NRoots = MaxColor + 1 + S.Allocas.size();
// Create GC Frame
AllocaInst *gcframe = new AllocaInst(T_prjlvalue,
#if JL_LLVM_VERSION >= 50000
0,
#endif
ConstantInt::get(T_int32, NRoots+2), "gcframe");
AllocaInst *gcframe = new AllocaInst(T_prjlvalue, 0,
ConstantInt::get(T_int32, NRoots + 2), "gcframe");
gcframe->insertBefore(&*F->getEntryBlock().begin());
// Zero out gcframe
BitCastInst *tempSlot_i8 = new BitCastInst(gcframe, Type::getInt8PtrTy(F->getContext()), "");
Expand Down Expand Up @@ -2009,17 +1921,21 @@ bool LateLowerGCFrame::doInitialization(Module &M) {
args.push_back(T_int32);
pool_alloc_func = Function::Create(FunctionType::get(T_prjlvalue, args, false),
Function::ExternalLinkage, "jl_gc_pool_alloc", &M);
addReturnAttr(pool_alloc_func, Attribute::NoAlias);
addReturnAttr(pool_alloc_func, Attribute::NonNull);
pool_alloc_func->setAttributes(AttributeList::get(M.getContext(),
alloc_obj_func->getAttributes().getFnAttributes(),
alloc_obj_func->getAttributes().getRetAttributes(),
None));
}
if (!(big_alloc_func = M.getFunction("jl_gc_big_alloc"))) {
std::vector<Type*> args(0);
args.push_back(T_pint8);
args.push_back(T_size);
big_alloc_func = Function::Create(FunctionType::get(T_prjlvalue, args, false),
Function::ExternalLinkage, "jl_gc_big_alloc", &M);
addReturnAttr(big_alloc_func, Attribute::NoAlias);
addReturnAttr(big_alloc_func, Attribute::NonNull);
big_alloc_func->setAttributes(AttributeList::get(M.getContext(),
alloc_obj_func->getAttributes().getFnAttributes(),
alloc_obj_func->getAttributes().getRetAttributes(),
None));
}
auto T_jlvalue = cast<PointerType>(T_prjlvalue)->getElementType();
T_pjlvalue = PointerType::get(T_jlvalue, 0);
Expand Down Expand Up @@ -2053,7 +1969,7 @@ bool LateLowerGCFrame::doInitialization(Module &M) {
j++;
}
if (j != 0)
append_to_compiler_used(M, ArrayRef<GlobalValue*>(function_list, j));
appendToCompilerUsed(M, ArrayRef<GlobalValue*>(function_list, j));
return true;
}

Expand Down
1 change: 1 addition & 0 deletions src/llvm-multiversioning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,7 @@ void CloneCtx::fix_inst_uses()
std::tie(id, slot) = get_reloc_slot(orig_f);
Instruction *ptr = new LoadInst(T_pvoidfunc, slot, "", false, insert_before);
ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
ptr->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(ctx, None));
ptr = new BitCastInst(ptr, F->getType(), "", insert_before);
use_i->setOperand(info.use->getOperandNo(),
rewrite_inst_use(uses.get_stack(), ptr,
Expand Down
Loading

0 comments on commit d803472

Please sign in to comment.