Skip to content

Commit

Permalink
[wasm] Add limited constant propagation to the jiterpreter for ldc.i4…
Browse files Browse the repository at this point in the history
… and ldloca (#99706)

Right now if an interpreter opcode stores a constant (ldc.i4) or effectively-constant (ldloca) expression into an interpreter local, we have to read it back from memory before using it later in a trace. However, there are many scenarios where it would be profitable to not do this, and instead embed the constant into the trace where the load would otherwise happen. This furthermore enables optimizing out null checks in some cases, since if the address being null-checked is constant, we can determine statically whether it is null and omit the runtime check entirely.
  • Loading branch information
kg authored Mar 21, 2024
1 parent cfe3d2d commit 309185f
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 16 deletions.
2 changes: 2 additions & 0 deletions src/mono/browser/runtime/jiterpreter-support.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1902,6 +1902,7 @@ export type JiterpreterOptions = {
enableWasmEh: boolean;
enableSimd: boolean;
zeroPageOptimization: boolean;
cprop: boolean;
// For locations where the jiterpreter heuristic says we will be unable to generate
// a trace, insert an entry point opcode anyway. This enables collecting accurate
// stats for options like estimateHeat, but raises overhead.
Expand Down Expand Up @@ -1947,6 +1948,7 @@ const optionNames: { [jsName: string]: string } = {
"enableWasmEh": "jiterpreter-wasm-eh-enabled",
"enableSimd": "jiterpreter-simd-enabled",
"zeroPageOptimization": "jiterpreter-zero-page-optimization",
"cprop": "jiterpreter-constant-propagation",
"enableStats": "jiterpreter-stats-enabled",
"disableHeuristic": "jiterpreter-disable-heuristic",
"estimateHeat": "jiterpreter-estimate-heat",
Expand Down
127 changes: 111 additions & 16 deletions src/mono/browser/runtime/jiterpreter-trace-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,44 @@ function is_backward_branch_target(
return false;
}

/**
 * An integer constant known to be held by an interpreter local.
 * Entries of this shape are recorded by emit_ldc when the loaded value is a number,
 * and are replayed as an i32_const when the local is later loaded with cprop enabled.
 */
interface KnownConstantI32 {
type: "i32";
value: number;
}

/**
 * A vector constant known to be held by an interpreter local.
 * `value` is the byte view captured by emit_simd at the point it emitted the v128_const.
 */
interface KnownConstantV128 {
type: "v128";
value: Uint8Array;
}

/**
 * Records that a local holds the address of another local (the result of an ldloca).
 * `offset` is the offset of the local whose address was taken, not of the local
 * that stores the address.
 */
interface KnownConstantLdloca {
type: "ldloca";
offset: number;
}

// Every kind of constant that can be tracked for a local, discriminated by `type`.
type KnownConstant = KnownConstantI32 | KnownConstantV128 | KnownConstantLdloca;
// The literal payloads returned by get_known_constant_value (ldloca entries carry none).
type KnownConstantValue = number | Uint8Array;
// Constant-tracking tables, keyed by the offset of the local the constant was stored into.
const knownConstantValues = new Map<number, KnownConstantValue>();
const knownConstants = new Map<number, KnownConstant>();

// Looks up the constant recorded for the local at localOffset.
// Locals flagged by isAddressTaken never yield a constant: the lookup returns undefined
// for them (presumably because writes through the taken address cannot be tracked here).
// Also returns undefined when nothing has been recorded for the offset.
function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
function get_known_constant(builder: WasmBuilder, localOffset: number): KnownConstant | undefined {
if (isAddressTaken(builder, localOffset))
return undefined;

return knownConstantValues.get(localOffset);
return knownConstants.get(localOffset);
}

/**
 * Fetches the literal payload of the constant tracked for a local, if there is one.
 *
 * @param builder The builder for the trace being generated; forwarded to get_known_constant.
 * @param localOffset Offset of the local to query.
 * @returns The number (i32) or byte view (v128) recorded for the local, or undefined when
 *  no constant is known or the tracked constant is an ldloca, which has no literal payload.
 */
function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
    const known = get_known_constant(builder, localOffset);

    // Only the i32 and v128 variants carry a `value`; anything else falls through to undefined.
    if ((known !== undefined) && ((known.type === "i32") || (known.type === "v128")))
        return known.value;

    return undefined;
}

// Perform a quick scan through the opcodes potentially in this trace to build a table of
Expand Down Expand Up @@ -553,11 +583,20 @@ export function generateWasmBody(
builder.local("pLocals");
// locals[ip[1]] = &locals[ip[2]]
const offset = getArgU16(ip, 2),
flag = isAddressTaken(builder, offset);
flag = isAddressTaken(builder, offset),
destOffset = getArgU16(ip, 1);
if (!flag)
mono_log_error(`${traceName}: Expected local ${offset} to have address taken flag`);
append_ldloca(builder, offset);
append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
append_stloc_tail(builder, destOffset, WasmOpcode.i32_store);
// Record this ldloca as a known constant so that later uses of it turn into a lea,
// and the wasm runtime can constant fold them with other constants. It's not uncommon
// to have code that does '&x + c', which (if this optimization works) should
// turn into '&locals + offsetof(x) + c' and get constant folded to have the same cost
// as a regular ldloc
knownConstants.set(destOffset, { type: "ldloca", offset: offset });
// dreg invalidation would blow the known constant away, so disable it
skipDregInvalidation = true;
break;
}

Expand Down Expand Up @@ -1712,14 +1751,14 @@ let cknullOffset = -1;
// Discards everything inferred so far: resets cknullOffset to -1 and empties the
// not-null set and the known-constant tables.
function eraseInferredState() {
cknullOffset = -1;
notNullSince.clear();
knownConstantValues.clear();
knownConstants.clear();
}

// Discards inferred state for the single local at `offset`: cknullOffset is reset to -1
// only if it currently equals this offset, and the offset's not-null and known-constant
// entries are removed.
function invalidate_local(offset: number) {
if (cknullOffset === offset)
cknullOffset = -1;
notNullSince.delete(offset);
knownConstantValues.delete(offset);
knownConstants.delete(offset);
}

function invalidate_local_range(start: number, bytes: number) {
Expand Down Expand Up @@ -1792,7 +1831,47 @@ function computeMemoryAlignment(offset: number, opcodeOrPrefix: WasmOpcode, simd
return alignment;
}

/**
 * Attempts to satisfy a load of a local by emitting its known constant instead of a memory read.
 *
 * Loading a value right after an ldc.i4 or ldloca wrote it is common. In that situation the
 * known-constant tracking is reliable, so the trace can embed the constant rather than read back
 * what was just stored. The original store still happens, so memory stays correct, and the
 * resulting trace is smaller and gives the native code generator immediates to work with.
 * If known-constant tracking is ever wrong, this will surface it immediately.
 *
 * @param builder The builder for the trace being generated.
 * @param offset Offset of the local being loaded.
 * @param opcodeOrPrefix The load opcode requested; only WasmOpcode.i32_load is eligible.
 * @param dryRun When true, nothing is emitted; the result only reports whether cprop would apply.
 * @param requireNonzero When true, a known i32 constant of zero is rejected.
 * @returns true if a constant was (or, for a dry run, would be) emitted in place of the load.
 */
function try_append_ldloc_cprop(
    builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode,
    dryRun: boolean, requireNonzero?: boolean
) {
    // The optimization must be enabled and only applies to plain i32 loads.
    if (!builder.options.cprop || (opcodeOrPrefix !== WasmOpcode.i32_load))
        return false;

    const known = get_known_constant(builder, offset);
    if (!known)
        return false;

    if (known.type === "i32") {
        // Callers that need a non-null address cannot use a constant zero.
        if (requireNonzero && (known.value === 0))
            return false;
        if (!dryRun)
            builder.i32_const(known.value);
        return true;
    }

    if (known.type === "ldloca") {
        // FIXME: Is a second invalidation of the local needed here? Probably not: it was
        // invalidated when the ldloca originally ran, and this only forwards that address
        // constant to the place it is used.
        // requireNonzero needs no handling because ldloca always yields a nonzero result.
        if (!dryRun)
            append_ldloca(builder, known.offset, 0);
        return true;
    }

    // Other constant kinds (such as v128) are not propagated through an i32 load.
    return false;
}

function append_ldloc(builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) {
if (try_append_ldloc_cprop(builder, offset, opcodeOrPrefix, false))
return;

builder.local("pLocals");
mono_assert(opcodeOrPrefix >= WasmOpcode.i32_load, () => `Expected load opcode but got ${opcodeOrPrefix}`);
builder.appendU8(opcodeOrPrefix);
Expand Down Expand Up @@ -1828,8 +1907,6 @@ function append_stloc_tail(builder: WasmBuilder, offset: number, opcodeOrPrefix:

// Pass bytesInvalidated=0 if you are reading from the local and the address will never be
// used for writes
// Pass transient=true if the address will not persist after use (so it can't be used to later
// modify the contents of this local)
function append_ldloca(builder: WasmBuilder, localOffset: number, bytesInvalidated?: number) {
if (typeof (bytesInvalidated) !== "number")
bytesInvalidated = 512;
Expand Down Expand Up @@ -1985,9 +2062,9 @@ function emit_ldc(builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintOpcode):
invalidate_local(localOffset);

if (typeof (value) === "number")
knownConstantValues.set(localOffset, value);
knownConstants.set(localOffset, { type: "i32", value: value });
else
knownConstantValues.delete(localOffset);
knownConstants.delete(localOffset);

return true;
}
Expand Down Expand Up @@ -2092,6 +2169,8 @@ function emit_fieldop(
notNullSince.has(objectOffset) &&
!isAddressTaken(builder, objectOffset);

// TODO: Figure out whether this is commonly used to access fields of structs that
// live on the stack, and if so, whether we want to do cprop of the ldloca
if (
(opcode !== MintOpcode.MINT_LDFLDA_UNSAFE) &&
(opcode !== MintOpcode.MINT_STFLD_O)
Expand Down Expand Up @@ -3088,13 +3167,21 @@ function emit_indirectop(builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintOp
return false;
}

append_ldloc_cknull(builder, addressVarIndex, ip, false);
// Check whether ldloc cprop is possible for the address var, if it is, skip doing the ldloc_cknull.
// We'll also skip loading cknull_ptr later.
const addressCprop = try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, true, true);
if (!addressCprop)
append_ldloc_cknull(builder, addressVarIndex, ip, false);

if (isLoad) {
// pre-load pLocals for the store operation
builder.local("pLocals");
// Load address
builder.local("cknull_ptr");
if (addressCprop)
mono_assert(try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, false, true), "Unknown jiterpreter cprop failure");
else
builder.local("cknull_ptr");

// For ldind_offset we need to load an offset from another local
// and then add it to the null checked address
if (isAddMul) {
Expand Down Expand Up @@ -3126,13 +3213,21 @@ function emit_indirectop(builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintOp
append_stloc_tail(builder, valueVarIndex, setter);
} else if (opcode === MintOpcode.MINT_STIND_REF) {
// Load destination address
builder.local("cknull_ptr");
if (addressCprop)
mono_assert(try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, false, true), "Unknown jiterpreter cprop failure");
else
builder.local("cknull_ptr");

// Load address of value so that copy_managed_pointer can grab it
append_ldloca(builder, valueVarIndex, 0);
builder.callImport("copy_ptr");
} else {
// Pre-load address for the store operation
builder.local("cknull_ptr");
if (addressCprop)
mono_assert(try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, false, true), "Unknown jiterpreter cprop failure");
else
builder.local("cknull_ptr");

// For ldind_offset we need to load an offset from another local
// and then add it to the null checked address
if (isOffset && offsetVarIndex >= 0) {
Expand Down Expand Up @@ -3429,7 +3524,7 @@ function emit_simd(
const view = localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128);
builder.v128_const(view);
append_simd_store(builder, ip);
knownConstantValues.set(getArgU16(ip, 1), view);
knownConstants.set(getArgU16(ip, 1), { type: "v128", value: view });
} else {
// dest
append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);
Expand Down
4 changes: 4 additions & 0 deletions src/mono/mono/utils/options-def.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ DEFINE_BOOL(jiterpreter_backward_branches_enabled, "jiterpreter-backward-branche
DEFINE_BOOL(jiterpreter_enable_simd, "jiterpreter-simd-enabled", TRUE, "Attempt to use WebAssembly SIMD support")
// Since the zero page is unallocated, loading array/string/span lengths from null ptrs will yield zero
DEFINE_BOOL(jiterpreter_zero_page_optimization, "jiterpreter-zero-page-optimization", TRUE, "Exploit the zero page being unallocated to optimize out null checks")
// We can produce higher quality code by embedding known constants directly into traces instead of loading
// the constant from its storage location in the interpreter's locals in memory, even if we can't skip
// the write of the constant into memory.
DEFINE_BOOL(jiterpreter_constant_propagation, "jiterpreter-constant-propagation", TRUE, "Propagate ldc.i4 and ldloca expressions forward to locations where those constants are used")
// When compiling a jit_call wrapper, bypass sharedvt wrappers if possible by inlining their
// logic into the compiled wrapper and calling the target AOTed function with native call convention
DEFINE_BOOL(jiterpreter_direct_jit_call, "jiterpreter-direct-jit-calls", TRUE, "Bypass gsharedvt wrappers when compiling JIT call wrappers")
Expand Down

0 comments on commit 309185f

Please sign in to comment.