Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimize constant length memorynew intrinsic (take 2) #56847

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 105 additions & 36 deletions src/cgutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3321,7 +3321,7 @@ static Value *emit_genericmemoryptr(jl_codectx_t &ctx, Value *mem, const jl_data
LoadInst *LI = ctx.builder.CreateAlignedLoad(PPT, addr, Align(sizeof(char*)));
LI->setOrdering(AtomicOrdering::NotAtomic);
LI->setMetadata(LLVMContext::MD_nonnull, MDNode::get(ctx.builder.getContext(), None));
jl_aliasinfo_t aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const);
jl_aliasinfo_t aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(LI);
Value *ptr = LI;
if (AS) {
Expand All @@ -3347,7 +3347,7 @@ static Value *emit_genericmemoryowner(jl_codectx_t &ctx, Value *t)
return emit_guarded_test(ctx, foreign, t, [&] {
addr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_jlgenericmemory, m, 1);
LoadInst *owner = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, addr, Align(sizeof(void*)));
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const);
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
ai.decorateInst(owner);
return ctx.builder.CreateSelect(ctx.builder.CreateIsNull(owner), t, owner);
});
Expand Down Expand Up @@ -4432,6 +4432,105 @@ static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b)
}
#endif

// Emit a call to jl_alloc_genericmemory_unchecked(ptls, nbytes, type).
// The runtime callee allocates the GenericMemory object but does NOT write
// its length field; callers are responsible for storing the element count
// afterwards (see emit_memory_zeroinit_and_stores).
static auto *emit_genericmemory_unchecked(jl_codectx_t &ctx, Value *cg_nbytes, Value *cg_typ)
{
    auto ptls = get_current_ptls(ctx);
    auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
    auto *alloc = ctx.builder.CreateCall(call, { ptls, cg_nbytes, cg_typ});
    // Propagate the callee's declared attributes onto this call site, then
    // refine the return value: heap-aligned and dereferenceable for at least
    // the GenericMemory header.
    alloc->setAttributes(call->getAttributes());
    alloc->addRetAttr(Attribute::getWithAlignment(alloc->getContext(), Align(JL_HEAP_ALIGNMENT)));
    // Fix: this attribute was previously added to the callee *after* its
    // attributes had already been copied onto `alloc`, so the call site never
    // carried it. Attach it to the call instruction directly, matching the
    // alignment attribute above.
    alloc->addRetAttr(Attribute::getWithDereferenceableBytes(alloc->getContext(), sizeof(jl_genericmemory_t)));
    return alloc;
}

// Finish initializing a freshly allocated GenericMemory value: give it a
// readable IR name, store the element count into the header, and — when the
// element type requires it — zero the data buffer.
static void emit_memory_zeroinit_and_stores(jl_codectx_t &ctx, jl_datatype_t *typ, Value* alloc, Value* nbytes, Value* nel, int zi)
{
    // Build a debug name such as "Memory{Int64}[]" for the allocated value.
    auto arg_typename = [&] JL_NOTSAFEPOINT {
        auto elty = jl_tparam1(typ);
        std::string eltname;
        if (jl_is_datatype(elty))
            eltname = jl_symbol_name(((jl_datatype_t*)elty)->name->name);
        else if (jl_is_uniontype(elty))
            eltname = "Union";
        else
            eltname = "<unknown type>";
        return "Memory{" + eltname + "}[]";
    };
    setName(ctx.emission_context, alloc, arg_typename);
    // jl_alloc_genericmemory_unchecked_func does not write the length field,
    // so store the element count into slot 0 of the header here.
    Value *derived = decay_derived(ctx, alloc);
    Value *length_slot = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, derived, 0);
    auto *length_store = ctx.builder.CreateAlignedStore(nel, length_slot, Align(sizeof(void*)));
    jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
    ai.decorateInst(length_store);
    // Single-thread release fence: keeps the length store from being deleted,
    // which would be illegal.
    ctx.builder.CreateFence(AtomicOrdering::Release, SyncScope::SingleThread);
    // Element type contains pointers or isbits unions: the buffer must be
    // zero-initialized.
    if (zi) {
        Value *data_slot = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, derived, 1);
        auto *data_ptr = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, data_slot, Align(sizeof(void*)));
        ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
        ai.decorateInst(data_ptr);
        auto *i8ty = getInt8Ty(ctx.builder.getContext());
        ctx.builder.CreateMemSet(data_ptr, ConstantInt::get(i8ty, 0), nbytes, Align(sizeof(void*)));
    }
}


// Emit an allocation of a GenericMemory whose element count `nel` is a
// compile-time constant. The size/overflow arithmetic folds to constants
// here, and allocations small enough for the GC pools are emitted inline
// (as a single object holding header + data), which later allows escape
// analysis to elide or stack-allocate them. `inst` is the zero-length
// singleton instance for this Memory type, returned directly when nel == 0.
static jl_cgval_t emit_const_len_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, size_t nel, jl_genericmemory_t *inst)
{
    if (nel == 0) {
        // Zero-length: reuse the per-type empty singleton; nothing to allocate.
        Value *empty_alloc = track_pjlvalue(ctx, literal_pointer_val(ctx, (jl_value_t*)inst));
        return mark_julia_type(ctx, empty_alloc, true, typ);
    }
    const jl_datatype_layout_t *layout = ((jl_datatype_t*)typ)->layout;
    assert(((jl_datatype_t*)typ)->has_concrete_subtype && layout != NULL);
    size_t elsz = layout->size;
    int isboxed = layout->flags.arrayelem_isboxed;
    int isunion = layout->flags.arrayelem_isunion;
    int zi = ((jl_datatype_t*)typ)->zeroinit;
    // Boxed elements are stored as pointers regardless of the element type's size.
    if (isboxed)
        elsz = sizeof(void*);
    // Compute nbytes = nel * elsz at compile time, tracking overflow.
    size_t nbytes;
    bool overflow = __builtin_mul_overflow(nel, elsz, &nbytes);
    if (isunion) {
        // an extra byte for each isbits union memory element, stored at m->ptr + m->length
        overflow |= __builtin_add_overflow(nbytes, nel, &nbytes);
    }
    // overflow if signed size is too big or nel is too big (the latter matters iff elsz==0)
    ssize_t tmp=1;
    overflow |= __builtin_add_overflow(nel, 1, &tmp) || __builtin_add_overflow(nbytes, 1, &tmp);
    // On overflow, emit an unconditional ArgumentError; code emitted after
    // this point is unreachable in that case.
    if (overflow)
        emit_error(ctx, prepare_call(jlargumenterror_func), "invalid GenericMemory size: the number of elements is either negative or too large for system address width");

    auto T_size = ctx.types().T_size;
    auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
    auto cg_nbytes = ConstantInt::get(T_size, nbytes);
    auto cg_nel = ConstantInt::get(T_size, nel);
    // Total object size: data bytes plus the aligned GenericMemory header.
    size_t tot = nbytes + LLT_ALIGN(sizeof(jl_genericmemory_t),JL_SMALL_BYTE_ALIGNMENT);
    // if allocation fits within GC pools
    int pooled = tot <= GC_MAX_SZCLASS;
    Value *alloc, *decay_alloc, *memory_ptr;
    jl_aliasinfo_t aliasinfo;
    if (pooled) {
        // Inline pool allocation: header and data live in one object, and the
        // header's data-pointer slot is set to point just past the aligned header.
        alloc = emit_allocobj(ctx, tot, cg_typ, false, JL_SMALL_BYTE_ALIGNMENT);
        decay_alloc = decay_derived(ctx, alloc);
        memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
        setName(ctx.emission_context, memory_ptr, "memory_ptr");
        auto objref = emit_pointer_from_objref(ctx, alloc);
        Value *memory_data = emit_ptrgep(ctx, objref, JL_SMALL_BYTE_ALIGNMENT);
        auto *store = ctx.builder.CreateAlignedStore(memory_data, memory_ptr, Align(sizeof(void*)));
        aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
        aliasinfo.decorateInst(store);
        setName(ctx.emission_context, memory_data, "memory_data");
    } else { // just use the dynamic length version since the malloc will be slow anyway
        alloc = emit_genericmemory_unchecked(ctx, cg_nbytes, cg_typ);
    }
    // Store the length field and zero-init the buffer if required.
    emit_memory_zeroinit_and_stores(ctx, typ, alloc, cg_nbytes, cg_nel, zi);
    return mark_julia_type(ctx, alloc, true, typ);
}

static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval_t nel, jl_genericmemory_t *inst)
{
emit_typecheck(ctx, nel, (jl_value_t*)jl_long_type, "memorynew");
Expand All @@ -4448,9 +4547,7 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
if (isboxed)
elsz = sizeof(void*);

auto ptls = get_current_ptls(ctx);
auto T_size = ctx.types().T_size;
auto int8t = getInt8Ty(ctx.builder.getContext());
BasicBlock *emptymemBB, *nonemptymemBB, *retvalBB;
emptymemBB = BasicBlock::Create(ctx.builder.getContext(), "emptymem");
nonemptymemBB = BasicBlock::Create(ctx.builder.getContext(), "nonemptymem");
Expand All @@ -4466,18 +4563,7 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
ctx.builder.CreateBr(retvalBB);
nonemptymemBB->insertInto(ctx.f);
ctx.builder.SetInsertPoint(nonemptymemBB);
// else actually allocate mem
auto arg_typename = [&] JL_NOTSAFEPOINT {
std::string type_str;
auto eltype = jl_tparam1(typ);
if (jl_is_datatype(eltype))
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
else if (jl_is_uniontype(eltype))
type_str = "Union";
else
type_str = "<unknown type>";
return "Memory{" + type_str + "}[]";
};

auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
auto cg_elsz = ConstantInt::get(T_size, elsz);

Expand All @@ -4501,27 +4587,10 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
overflow = ctx.builder.CreateOr(overflow, tobignel);
Value *notoverflow = ctx.builder.CreateNot(overflow);
error_unless(ctx, prepare_call(jlargumenterror_func), notoverflow, "invalid GenericMemory size: the number of elements is either negative or too large for system address width");
// actually allocate
auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
Value *alloc = ctx.builder.CreateCall(call, { ptls, nbytes, cg_typ});
// set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
Value *decay_alloc = decay_derived(ctx, alloc);
Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
auto len_store = ctx.builder.CreateAlignedStore(nel_unboxed, len_field, Align(sizeof(void*)));
auto aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
aliasinfo.decorateInst(len_store);
//This avoids the length store from being deleted which is illegal
ctx.builder.CreateFence(AtomicOrdering::Release, SyncScope::SingleThread);
// zeroinit pointers and unions
if (zi) {
Value *memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(load);
ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), nbytes, Align(sizeof(void*)));
}
// actually allocate the memory

setName(ctx.emission_context, alloc, arg_typename);
Value *alloc = emit_genericmemory_unchecked(ctx, nbytes, cg_typ);
emit_memory_zeroinit_and_stores(ctx, typ, alloc, nbytes, nel_unboxed, zi);
ctx.builder.CreateBr(retvalBB);
nonemptymemBB = ctx.builder.GetInsertBlock();
// phi node to choose which side of branch
Expand Down
23 changes: 19 additions & 4 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,9 +408,9 @@ struct jl_tbaacache_t {
tbaa_arrayptr = tbaa_make_child(mbuilder, "jtbaa_arrayptr", tbaa_array_scalar).first;
tbaa_arraysize = tbaa_make_child(mbuilder, "jtbaa_arraysize", tbaa_array_scalar).first;
tbaa_arrayselbyte = tbaa_make_child(mbuilder, "jtbaa_arrayselbyte", tbaa_array_scalar).first;
tbaa_memoryptr = tbaa_make_child(mbuilder, "jtbaa_memoryptr", tbaa_array_scalar, true).first;
tbaa_memorylen = tbaa_make_child(mbuilder, "jtbaa_memorylen", tbaa_array_scalar, true).first;
tbaa_memoryown = tbaa_make_child(mbuilder, "jtbaa_memoryown", tbaa_array_scalar, true).first;
tbaa_memoryptr = tbaa_make_child(mbuilder, "jtbaa_memoryptr", tbaa_array_scalar).first;
tbaa_memorylen = tbaa_make_child(mbuilder, "jtbaa_memorylen", tbaa_array_scalar).first;
tbaa_memoryown = tbaa_make_child(mbuilder, "jtbaa_memoryown", tbaa_array_scalar).first;
tbaa_const = tbaa_make_child(mbuilder, "jtbaa_const", nullptr, true).first;
}
};
Expand Down Expand Up @@ -1179,6 +1179,7 @@ static const auto jl_alloc_genericmemory_unchecked_func = new JuliaFunction<Type
},
[](LLVMContext &C) {
auto FnAttrs = AttrBuilder(C);
FnAttrs.addAllocKindAttr(AllocFnKind::Alloc);
FnAttrs.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Ref) | MemoryEffects::inaccessibleMemOnly(ModRefInfo::ModRef));
FnAttrs.addAttribute(Attribute::WillReturn);
FnAttrs.addAttribute(Attribute::NoUnwind);
Expand Down Expand Up @@ -1499,6 +1500,10 @@ static const auto pointer_from_objref_func = new JuliaFunction<>{
AttrBuilder FnAttrs(C);
FnAttrs.addMemoryAttr(MemoryEffects::none());
FnAttrs.addAttribute(Attribute::NoUnwind);
FnAttrs.addAttribute(Attribute::Speculatable);
FnAttrs.addAttribute(Attribute::WillReturn);
FnAttrs.addAttribute(Attribute::NoRecurse);
FnAttrs.addAttribute(Attribute::NoSync);
return AttributeList::get(C,
AttributeSet::get(C, FnAttrs),
Attributes(C, {Attribute::NonNull}),
Expand Down Expand Up @@ -4470,7 +4475,17 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
jl_genericmemory_t *inst = (jl_genericmemory_t*)((jl_datatype_t*)typ)->instance;
if (inst == NULL)
return false;
*ret = emit_memorynew(ctx, typ, argv[2], inst);
if (argv[2].constant) {
if (!jl_is_long(argv[2].constant))
return false;
size_t nel = jl_unbox_long(argv[2].constant);
if (nel < 0)
return false;
*ret = emit_const_len_memorynew(ctx, typ, nel, inst);
}
else {
*ret = emit_memorynew(ctx, typ, argv[2], inst);
}
return true;
}

Expand Down
4 changes: 2 additions & 2 deletions src/llvm-alloc-helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r
return true;
}
if (required.pass.gc_loaded_func == callee) {
required.use_info.haspreserve = true;
required.use_info.hasload = true;
// TODO add manual load->store forwarding
push_inst(inst);
return true;
}
if (required.pass.typeof_func == callee) {
Expand Down
10 changes: 5 additions & 5 deletions src/llvm-alloc-opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -752,11 +752,11 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
call->eraseFromParent();
return;
}
//if (pass.gc_loaded_func == callee) {
// call->replaceAllUsesWith(new_i);
// call->eraseFromParent();
// return;
//}
if (pass.gc_loaded_func == callee) {
// TODO: handle data pointer forwarding, length forwarding, and fence removal
user->replaceUsesOfWith(orig_i, Constant::getNullValue(orig_i->getType()));
return;
}
if (pass.typeof_func == callee) {
++RemovedTypeofs;
call->replaceAllUsesWith(tag);
Expand Down
1 change: 1 addition & 0 deletions src/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
JULIA_PASS(FPM.addPass(LateLowerGCPass()));
JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
if (O.getSpeedupLevel() >= 2) {
FPM.addPass(DSEPass());
FPM.addPass(GVNPass());
FPM.addPass(SCCPPass());
FPM.addPass(DCEPass());
Expand Down
50 changes: 50 additions & 0 deletions test/boundscheck_exec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -297,4 +297,54 @@ end
typeintersect(Int, Integer)
end |> only === Type{Int}

if bc_opt == bc_default
@testset "Array/Memory escape analysis" begin
function no_allocate(T::Type{<:Union{Memory, Vector}})
v = T(undef, 2)
v[1] = 2
v[2] = 3
return v[1] + v[2]
end
function test_alloc(::Type{T}; broken=false) where T
@test (@allocated no_allocate(T)) == 0 broken=broken
end
@testset "$T" for T in [Memory, Vector]
@testset "$ET" for ET in [Int, Float32, Union{Int, Float64}]
no_allocate(T{ET}) #compile
# allocations aren't removed for Union eltypes which they theoretically could be eventually
test_alloc(T{ET}, broken=(ET==Union{Int, Float64}))
end
end
function f() # this was causing a bug on an in progress version of #55913.
m = Memory{Float64}(undef, 4)
m .= 1.0
s = 0.0
for x ∈ m
s += x
end
s
end
@test f() === 4.0
function confuse_alias_analysis()
mem0 = Memory{Int}(undef, 1)
mem1 = Memory{Int}(undef, 1)
@inbounds mem0[1] = 3
for width in 1:2
@inbounds mem1[1] = mem0[1]
mem0 = mem1
end
mem0[1]
end
@test confuse_alias_analysis() == 3
@test (@allocated confuse_alias_analysis()) == 0
function no_alias_prove(n)
m1 = Memory{Int}(undef,n)
m2 = Memory{Int}(undef,n)
m1 === m2
end
no_alias_prove(1)
@test_broken (@allocated no_alias_prove(5)) == 0
end
end

end
Loading