From 674fd78849578f6a205b1f6cc293ac2ab42775cf Mon Sep 17 00:00:00 2001
From: Tom An
Date: Sun, 24 Nov 2024 20:42:02 -0500
Subject: [PATCH 01/18] inline tier

---
 src/engine/compiler/SinglePassCompiler.v3 | 318 +++++++++++++++++-----
 1 file changed, 253 insertions(+), 65 deletions(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index bff1f7cc..9c953556 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -88,7 +88,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	def state = SpcState.new(regAlloc);
 	// Other state
 	def trap_labels = Vector<(TrapReason, MasmLabel)>.new();
-	var start_pos = 0;
+	//var start_pos = 0;
 	var module: Module;
 	var func: FuncDecl;
 	var sig: SigDecl;
@@ -103,6 +103,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	var last_probe = 0;
 	var skip_to_end: bool;
 
+	// when a function is inlined, we continue using the caller's abstract state, and
+	// push the callee's params/locals as needed, so we need to track the base sp of the locals
+	// in the current context.
+	var local_base_sp: u31 = 0;
+	// certain inlined functions need access to their instance, which might be different, so we
+	// simply store the callee instance address on the abstract state, and mark the slot. If the value is
+	// negative, we use the current instance whenever needed.
+	var inlined_instance_slot = -1;
+
 	new() {
 		masm.unimplemented = unsupported;
 		masm.newTrapLabel = newTrapLabel; // trap labels are per-pc
@@ -159,6 +168,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			last_probe = 0;
 			masm.source_loc = it.pc;
 			it.dispatch(this);
+			if (Trace.compiler) {
+				OUT.puts("JIT code: ");
+				masm.printCodeBytes(OUT);
+				OUT.ln();
+			}
 			unrefRegs();
 			if (Debug.compiler) checkRegAlloc();
 			it.next();
@@ -352,7 +366,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.emit_debugger_breakpoint();
 			return;
 		}
-		x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func)) {
+		x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func)){
 			emitWhammProbe(x);
 			return;
 		}
@@ -375,12 +389,46 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	}
 
 	// saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe
 	def emitWhammProbe(probe: WhammProbe) {
-		// spill entire value stack.
-		state.emitSaveAll(resolver, probeSpillMode);
 		// set up args and push to frame slots.
 		var whamm_sig = probe.sig;
 		var offsets = masm.getOffsets();
+		var inline_config = InlineConfig(false, false, false);
+		var new_local_base_sp = 0;
+		var orig_sp = state.sp;
 		var callee_func = WasmFunction.!(probe.func);
+
+		if (SpcTuning.inlineSmallFunc) {
+			// TODO: can reuse when implementing inlining for SPC
+			inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func);
+			if (!probe.inline_heuristic_checked) {
+				inline_config = funcCanInline(callee_func.decl);
+				probe.inline_heuristic_checked = true;
+				probe.spc_swap_instance = inline_config.swap_instance;
+				probe.spc_swap_membase = inline_config.swap_membase;
+			}
+
+			if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly
+				var whamm_instance_addr = Pointer.atObject(callee_func.instance) - Pointer.NULL;
+				var slot_addr = masm.slotAddr(state.sp);
+				inlined_instance_slot = int.view(state.sp);
+				state.push(KIND_REF | IS_STORED, NO_REG, 0);
+				masm.emit_mov_m_l(slot_addr, whamm_instance_addr);
+			}
+
+			// overwrite mem0_base with whamm instance's memory base, restore from frame slot later
+			if (inline_config.swap_membase) {
+				var memobj_addr = Pointer.atObject(callee_func.instance.memories[0]) - Pointer.NULL;
+				masm.emit_mov_r_l(regs.mem0_base, i64.view(memobj_addr));
+				masm.emit_read_v3_mem_base(regs.mem0_base, regs.mem0_base);
+			}
+		}
+
+		if (!inline_config.can_inline) {
+			state.emitSaveAll(resolver, probeSpillMode);
+		} else {
+			new_local_base_sp = int.view(state.sp);
+		}
+
 		for (i < whamm_sig.length) {
 			var slot_addr = masm.slotAddr(state.sp + u32.view(i));
 			match(whamm_sig[i]) {
@@ -392,59 +440,166 @@
 					masm.emit_br_r(regs.scratch, MasmBrCond.REF_NONNULL, cont_label);
 					// special case: requires runtime call to materialize FrameAccessor object
+					if (inline_config.can_inline) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack.
 					masm.emit_call_runtime_materialize_frame_accessor();
 					masm.emit_mov_r_m(ValueKind.REF, regs.scratch, frame.accessor_slot);
+					emit_reload_regs();
+					if (inline_config.can_inline && !probeSpillMode.free_regs) state.emitRestoreAll(resolver);
-					// move result to mem slot
+					// move result to mem slot or reg, depending on inlining
 					masm.bindLabel(cont_label);
-					masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef));
+					if (inline_config.can_inline) {
+						var reg = allocRegTos(ValueKind.REF);
+						masm.emit_mov_r_m(ValueKind.REF, reg, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef));
+						state.push(KIND_REF | IN_REG, reg, 0);
+					} else {
+						masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef));
+					}
 				}
 				Val(val) => {
 					var is_v128 = false;
 					var low: u64, high: u64;
 					match (val) {
-						I31(v) => low = v;
-						I32(v) => low = v;
-						I64(v) => low = v;
-						F32(v) => low = v;
-						F64(v) => low = v;
+						I31(v) => {
+							if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v));
+							low = v;
+						}
+						I32(v) => {
+							low = v;
+							if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v));
+						}
+						I64(v) => {
+							low = v;
+							if (inline_config.can_inline) {
+								var reg = allocRegTos(ValueKind.I64);
+								masm.emit_mov_r_l(reg, i64.view(v));
+								state.push(KIND_I64 | IN_REG, reg, 0);
+							}
+						}
+						F32(v) => {
+							low = v;
+							if (inline_config.can_inline) {
+								var reg = allocRegTos(ValueKind.F32);
+								masm.emit_mov_r_f32(reg, v);
+								state.push(KIND_F32 | IN_REG, reg, 0);
+							}
+						}
+						F64(v) => {
+							low = v;
+							if (inline_config.can_inline) {
+								var reg = allocRegTos(ValueKind.F64);
+								masm.emit_mov_r_d64(reg, v);
+								state.push(KIND_F64 | IN_REG, reg, 0);
+							}
+						}
 						V128(l, h) => {
 							low = l;
 							high = h;
 							is_v128 = true;
+							if (inline_config.can_inline) {
+								var reg = allocRegTos(ValueKind.V128);
+								masm.emit_mov_r_q(reg, low, high);
+								state.push(KIND_V128 | IN_REG, reg, 0);
+							}
+						}
+						Ref(v) => {
+							low = u64.view(Pointer.atObject(v) - Pointer.NULL);
+							if (inline_config.can_inline) {
+								var reg = allocRegTos(ValueKind.REF);
+								masm.emit_mov_r_l(reg, i64.view(low));
+								state.push(KIND_REF | IN_REG, reg, 0);
+							}
 						}
-						Ref(val) => low = u64.view(Pointer.atObject(val) - Pointer.NULL);
 					}
-					masm.emit_mov_m_d(slot_addr, low);
-					if (is_v128) {
-						masm.emit_mov_m_d(slot_addr.plus(8), high);
+					if (!inline_config.can_inline) {
+						masm.emit_mov_m_d(slot_addr, low);
+						if (is_v128) {
+							masm.emit_mov_m_d(slot_addr.plus(8), high);
+						}
 					}
 				}
 				Operand(_, i) => {
-					masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(state.sp + u32.view(i) - 1));
+					var index = orig_sp + u32.view(i);
+					if (inline_config.can_inline) {
+						visit_LOCAL_GET(u31.view(index));
+					} else {
+						masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(index));
+					}
 				}
 				Local(_, i) => {
-					masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(u32.view(i)));
+					if (inline_config.can_inline) {
+						visit_LOCAL_GET(u31.view(i));
+					} else {
+						masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(u32.view(i)));
+					}
 				}
 			}
 		}
 		var whamm_instance = callee_func.instance;
 		var func_id = callee_func.decl.func_index;
+		var whamm_module = whamm_instance.module;
+		var whamm_func_decl = callee_func.decl;
+		if (inline_config.can_inline) {
+			var orig_decl = it.func;
+			var orig_pc = it.pc;
+			var orig_module = module;
+			var orig_sig = sig;
+
+			// prepare spc for inlining
+			this.local_base_sp = u31.view(new_local_base_sp);
+			this.module = whamm_module;
+			this.func = whamm_func_decl;
+			this.sig = whamm_func_decl.sig;
+
+			// inline codegen
+			it.reset(this.func);
+			it.dispatchLocalDecls(this);
+			if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln();
+			while (it.more() && success) {
+				if (Trace.compiler) traceOpcodeAndStack(false);
+				last_probe = 0;
+				masm.source_loc = it.pc;
+				it.dispatch(this);
+				if (Trace.compiler) {
+					OUT.puts("JIT code: ");
+					masm.printCodeBytes(OUT);
+					OUT.ln();
+				}
+				unrefRegs();
+				if (Debug.compiler) checkRegAlloc();
+				it.next();
+			}
+			if (Trace.compiler) Trace.OUT.puts("Finished compiling inlined whamm probe").ln();
+
+			// restore spc after inlining
+			it.reset(orig_decl).at(orig_pc);
+			this.local_base_sp = 0;
+			this.inlined_instance_slot = -1;
+			this.module = orig_module;
+			this.func = orig_decl;
+			this.sig = orig_sig;
+			if (inline_config.swap_membase) {
+				masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot);
+			}
-		var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp);
-		var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg);
-		var tmp = allocTmp(ValueKind.REF);
-
-		// Load the target code/entrypoint.
-		masm.emit_mov_r_l(func_reg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL);
-		masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(func_reg, offsets.WasmFunction_decl));
-		masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(tmp, offsets.FuncDecl_target_code));
-		// adjust vsp_reg to compute the "true" VSP, accounting for args to WhammProbe's WasmFunction
-		emit_compute_vsp(vsp_reg, state.sp + u32.view(whamm_sig.length));
-		// Call to the entrypoint.
-		masm.emit_call_r(tmp);
-		emit_reload_regs();
-		if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver);
+			// clear callee params/locals from abstract state
+			dropN(state.sp - orig_sp);
+		} else {
+			var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp);
+			var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg);
+			var tmp = allocTmp(ValueKind.REF);
+
+			// Load the target code/entrypoint.
+			masm.emit_mov_r_l(func_reg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL);
+			masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(func_reg, offsets.WasmFunction_decl));
+			masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(tmp, offsets.FuncDecl_target_code));
+			// adjust vsp_reg to compute the "true" VSP, accounting for args to WhammProbe's WasmFunction
+			emit_compute_vsp(vsp_reg, state.sp + u32.view(whamm_sig.length));
+			// Call to the entrypoint.
+			masm.emit_call_r(tmp);
+			emit_reload_regs();
+			if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver);
+		}
 	}
 
 	def visit_CRASH_EXEC() {
@@ -508,35 +663,37 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		setUnreachable();
 	}
 	def visit_END() {
-		var ctl_top = state.ctl_stack.peek();
-		if (ctl_top.opcode == Opcode.LOOP.code) {
-			state.ctl_stack.pop();
-			if (!ctl_top.reachable) setUnreachable();
-		} else if (ctl_top.opcode == Opcode.IF.code) {
-			// simulate empty if-true block
-			state.emitFallthru(resolver);
-			masm.emit_br(ctl_top.label);
-			masm.bindLabel(ctl_top.else_label);
-			state.doElse();
-			ctl_top.opcode = Opcode.ELSE.code;
-			state.emitFallthru(resolver);
-			masm.bindLabel(ctl_top.label);
-			state.resetToMerge(ctl_top);
-			state.ctl_stack.pop();
-		} else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) {
-			state.emitFallthru(resolver);
-			masm.bindLabel(ctl_top.label);
-			state.resetToMerge(ctl_top);
-			state.ctl_stack.pop();
-		} else if (ctl_top.opcode == Opcode.RETURN.code) {
-			state.emitFallthru(resolver);
-			masm.bindLabel(ctl_top.label);
-			state.resetToMerge(ctl_top);
+		if (this.local_base_sp == 0) {
+			var ctl_top = state.ctl_stack.peek();
+			if (ctl_top.opcode == Opcode.LOOP.code) {
+				state.ctl_stack.pop();
+				if (!ctl_top.reachable) setUnreachable();
+			} else if (ctl_top.opcode == Opcode.IF.code) {
+				// simulate empty if-true block
+				state.emitFallthru(resolver);
+				masm.emit_br(ctl_top.label);
+				masm.bindLabel(ctl_top.else_label);
+				state.doElse();
+				ctl_top.opcode = Opcode.ELSE.code;
+				state.emitFallthru(resolver);
+				masm.bindLabel(ctl_top.label);
+				state.resetToMerge(ctl_top);
+				state.ctl_stack.pop();
+			} else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) {
+				state.emitFallthru(resolver);
+				masm.bindLabel(ctl_top.label);
+				state.resetToMerge(ctl_top);
+				state.ctl_stack.pop();
+			} else if (ctl_top.opcode == Opcode.RETURN.code) {
+				state.emitFallthru(resolver);
+				masm.bindLabel(ctl_top.label);
+				state.resetToMerge(ctl_top);
+				emitProbe();
+				if (ctl_top.merge_count > 1) emitReturn(ctl_top);
+				state.ctl_stack.pop();
+			}
 			emitProbe();
-			if (ctl_top.merge_count > 1) emitReturn(ctl_top);
-			state.ctl_stack.pop();
 		}
-		emitProbe();
 	}
 	def visit_BR(depth: u31) {
 		var target = state.getControl(depth);
@@ -812,6 +969,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		dropN(valcount);
 	}
 	def visit_LOCAL_GET(index: u31) {
+		index = index + local_base_sp;
 		var lv = state.get(index);
 		if (lv.inReg()) {
 			regAlloc.assign(lv.reg, int.!(state.sp));
@@ -829,6 +987,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		}
 	}
 	def visit_LOCAL_SET(index: u31) {
+		index = index + local_base_sp;
 		var lv = state.get(index);
 		var sv = state.pop();
 		if (sv.inReg()) {
@@ -853,6 +1012,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		}
 	}
 	def visit_LOCAL_TEE(index: u31) {
+		index = index + local_base_sp;
 		var lv = state.get(index);
 		regAlloc.unassign(lv.reg, index); // unref existing register
 		var sv = state.peek();
@@ -943,7 +1103,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		if (i32.view(val) == val) {
 			state.push(KIND_I64 | IS_CONST, NO_REG, i32.view(val));
 		} else {
-			var tos = state.sp;
 			var reg = allocRegTos(ValueKind.I64);
 			masm.emit_mov_r_l(reg, val);
 			state.push(KIND_I64 | IN_REG, reg, 0);
@@ -1304,7 +1463,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		}
 	}
 	def emit_load_instance(reg: Reg) {
-		masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot);
+		var instance_addr = frame.instance_slot;
+		if (inlined_instance_slot >= 0) {
+			instance_addr = masm.slotAddr(u32.view(inlined_instance_slot));
+		}
+		masm.emit_mov_r_m(ValueKind.REF, reg, instance_addr);
 	}
 	def emitLoad(kind: ValueKind, imm: MemArg, meth: (ValueKind, Reg, Reg, Reg, u32) -> ()) {
 		var base_reg = regs.mem0_base;
@@ -1534,11 +1697,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	}
 	def traceOpcode(orig: bool) {
 		OUT.flush();
-		var pc = it.pc - start_pos;
 		instrTracer.instr_width = Opcodes.longestName + 1;
-		instrTracer.putPcAndInstr(OUT, module, func, pc, orig);
-		OUT.puts("JIT code: ");
-		masm.printCodeBytes(OUT);
+		instrTracer.putPcAndInstr(OUT, module, func, it.pc, orig);
 		OUT.ln();
 	}
 }
@@ -2184,4 +2344,32 @@ class MoveNode {
 	var src: MoveNode;		// source of the value for this node
 	var dstList: MoveNode;	// head of destination list
 	var dstNext: MoveNode;	// next in a list of successors
-}
\ No newline at end of file
+}
+
+// checks function bytecode to see if it can be inlined, based on
+// simple heuristics: bytecode length <= 50, at most 10 params, and straight-line code.
+def funcCanInline(decl: FuncDecl) -> InlineConfig {
+	var default = InlineConfig(false, false, false);
+	if (decl.orig_bytecode.length > 50 || decl.sig.params.length > 10) return default;
+	var bi = BytecodeIterator.new().reset(decl);
+	var swap_instance = false;
+	var swap_membase = false;
+	while (bi.more()) {
+		var op = bi.current();
+		match (op) {
+			IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default;
+			THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP,
+			ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true;
+			I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32,
+			V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U,
+			I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => {
+				swap_membase = true;
+			}
+			_ => ;
+		}
		bi.next();
+	}
+	return InlineConfig(swap_membase, swap_instance, true);
+}
+
+type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool);

From 44c428122dd44d00c3fd1e2a358ef997a6f5d148 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Sun, 24 Nov 2024 22:22:49 -0500
Subject: [PATCH 02/18] fix get operand bug

---
 src/engine/compiler/SinglePassCompiler.v3 | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 9c953556..ae33b6b1 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -366,8 +366,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.emit_debugger_breakpoint();
 			return;
 		}
-		x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func)){
-			emitWhammProbe(x);
+		x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func) && emitWhammProbe(x)) {
 			return;
 		}
 	}
@@ -388,7 +387,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	}
 
 	// saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe
-	def emitWhammProbe(probe: WhammProbe) {
+	def emitWhammProbe(probe: WhammProbe) -> bool {
 		// set up args and push to frame slots.
 		var whamm_sig = probe.sig;
 		var offsets = masm.getOffsets();
@@ -405,7 +404,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				probe.inline_heuristic_checked = true;
 				probe.spc_swap_instance = inline_config.swap_instance;
 				probe.spc_swap_membase = inline_config.swap_membase;
+				probe.spc_inline_func = inline_config.can_inline;
 			}
+			if (!inline_config.can_inline && X86_64DynamicStrategy.?(Execute.tiering)) return false;
 
 			if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly
 				var whamm_instance_addr = Pointer.atObject(callee_func.instance) - Pointer.NULL;
 				var slot_addr = masm.slotAddr(state.sp);
@@ -421,7 +422,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				masm.emit_mov_r_l(regs.mem0_base, i64.view(memobj_addr));
 				masm.emit_read_v3_mem_base(regs.mem0_base, regs.mem0_base);
 			}
-		}
+		} else if (X86_64DynamicStrategy.?(Execute.tiering)) return false;
 
 		if (!inline_config.can_inline) {
 			state.emitSaveAll(resolver, probeSpillMode);
@@ -519,7 +520,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			}
 		}
 		Operand(_, i) => {
-			var index = orig_sp + u32.view(i);
+			var index = orig_sp + u32.view(i) - 1;
 			if (inline_config.can_inline) {
 				visit_LOCAL_GET(u31.view(index));
 			} else {
@@ -600,6 +601,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			emit_reload_regs();
 			if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver);
 		}
+		return true;
 	}
 
 	def visit_CRASH_EXEC() {

From 8cce635d2e107972f402d0a6968e9c5ecfadb6c1 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Mon, 25 Nov 2024 18:13:23 -0500
Subject: [PATCH 03/18] fix inline code iterator bug

---
 src/engine/BytecodeIterator.v3            |  1 +
 src/engine/compiler/SinglePassCompiler.v3 | 63 +++++++++++++++--------
 src/engine/x86-64/X86_64MasmRegs.v3       | 23 ++++-----
 3 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3
index bc1ccaff..ea392aa5 100644
--- a/src/engine/BytecodeIterator.v3
+++ b/src/engine/BytecodeIterator.v3
@@ -104,6 +104,7 @@ class BytecodeIterator {
 		if (b == InternalOpcode.PROBE.code) { // probe is inserted here
 			v.visitProbe();
 			b = origptr.reset(func.orig_bytecode, pc, func.orig_bytecode.length).read1();
+			codeptr.reset(func.cur_bytecode, pc + 1, func.cur_bytecode.length);
 		}
 		// Query opcode attributes array
 		var opcode: Opcode;

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 1cd30254..5caf049f 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -9,8 +9,10 @@ class SpcExecEnv {
 	var vfp_slot: MasmAddr;
 	var pc_slot: MasmAddr;
 	var instance_slot: MasmAddr;
+	var inlined_instance_slot: MasmAddr;
 	var wasm_func_slot: MasmAddr;
 	var mem0_base_slot: MasmAddr;
+	var inlined_mem0_base_slot: MasmAddr;
 	var accessor_slot: MasmAddr;
 
 	// Register information.
@@ -107,10 +109,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	// push the callee's params/locals as needed, so we need to track the base sp of the locals
 	// in the current context.
 	var local_base_sp: u31 = 0;
-	// certain inlined functions need access to their instance, which might be different, so we
-	// simply store the callee instance address on the abstract state, and mark the slot. If the value is
-	// negative, we use the current instance whenever needed.
-	var inlined_instance_slot = -1;
+	var is_inlined = false;
 
 	new() {
 		masm.unimplemented = unsupported;
 		masm.newTrapLabel = newTrapLabel; // trap labels are per-pc
@@ -300,6 +299,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.emit_read_v3_mem_base(regs.mem0_base, regs.mem0_base);
 			masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base);
 		}
+		//if (func.func_index == 1) {
+			//emitTrap(TrapReason.UNREACHABLE);
+		//}
 	}
 	def visitLocalDecl(count: u32, vtc: ValueTypeCode) {
 		var vt = vtc.toAbstractValueType(module);
@@ -366,7 +368,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.emit_debugger_breakpoint();
 			return;
 		}
-		x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func) && emitWhammProbe(x)) {
+		x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func)) {
+			emitWhammProbe(x);
 			return;
 		}
 	}
@@ -387,7 +390,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	}
 
 	// saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe
-	def emitWhammProbe(probe: WhammProbe) -> bool {
+	def emitWhammProbe(probe: WhammProbe) {
 		// set up args and push to frame slots.
 		var whamm_sig = probe.sig;
 		var offsets = masm.getOffsets();
@@ -406,14 +409,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				probe.spc_swap_membase = inline_config.swap_membase;
 				probe.spc_inline_func = inline_config.can_inline;
 			}
-			if (!inline_config.can_inline && X86_64DynamicStrategy.?(Execute.tiering)) return false;
 
 			if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly
 				var whamm_instance_addr = Pointer.atObject(callee_func.instance) - Pointer.NULL;
-				var slot_addr = masm.slotAddr(state.sp);
-				inlined_instance_slot = int.view(state.sp);
-				state.push(KIND_REF | IS_STORED, NO_REG, 0);
-				masm.emit_mov_m_l(slot_addr, whamm_instance_addr);
+				masm.emit_mov_m_l(frame.inlined_instance_slot, whamm_instance_addr);
 			}
 
 			// overwrite mem0_base with whamm instance's memory base, restore from frame slot later
@@ -421,8 +420,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				var memobj_addr = Pointer.atObject(callee_func.instance.memories[0]) - Pointer.NULL;
 				masm.emit_mov_r_l(regs.mem0_base, i64.view(memobj_addr));
 				masm.emit_read_v3_mem_base(regs.mem0_base, regs.mem0_base);
+				masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, regs.mem0_base);
 			}
-		} else if (X86_64DynamicStrategy.?(Execute.tiering)) return false;
+		}
 
 		if (!inline_config.can_inline) {
 			state.emitSaveAll(resolver, probeSpillMode);
@@ -458,18 +458,25 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 					} else {
 						masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef));
 					}
+					kind = ValueKind.REF.code;
 				}
 				Val(val) => {
 					var is_v128 = false;
 					var low: u64, high: u64;
 					match (val) {
 						I31(v) => {
-							if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v));
+							if (inline_config.can_inline) {
+								var reg = allocRegTos(ValueKind.REF);
+								masm.emit_mov_r_i(reg, i32.view(v));
+								state.push(KIND_REF | IN_REG, reg, 0);
+							}
 							low = v;
+							kind = KIND_REF;
 						}
 						I32(v) => {
 							low = v;
 							if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v));
+							kind = KIND_I32;
 						}
 						I64(v) => {
 							low = v;
 							if (inline_config.can_inline) {
 								var reg = allocRegTos(ValueKind.I64);
 								masm.emit_mov_r_l(reg, i64.view(v));
 								state.push(KIND_I64 | IN_REG, reg, 0);
 							}
+							kind = KIND_I64;
 						}
 						F32(v) => {
 							low = v;
 							if (inline_config.can_inline) {
 								var reg = allocRegTos(ValueKind.F32);
 								masm.emit_mov_r_f32(reg, v);
 								state.push(KIND_F32 | IN_REG, reg, 0);
 							}
+							kind = KIND_F32;
 						}
 						F64(v) => {
 							low = v;
 							if (inline_config.can_inline) {
 								var reg = allocRegTos(ValueKind.F64);
 								masm.emit_mov_r_d64(reg, v);
 								state.push(KIND_F64 | IN_REG, reg, 0);
 							}
+							kind = KIND_F64;
 						}
 						V128(l, h) => {
 							low = l;
 							high = h;
 							is_v128 = true;
 							if (inline_config.can_inline) {
 								var reg = allocRegTos(ValueKind.V128);
 								masm.emit_mov_r_q(reg, low, high);
 								state.push(KIND_V128 | IN_REG, reg, 0);
 							}
+							kind = KIND_V128;
 						}
 						Ref(v) => {
 							low = u64.view(Pointer.atObject(v) - Pointer.NULL);
 							if (inline_config.can_inline) {
 								var reg = allocRegTos(ValueKind.REF);
 								masm.emit_mov_r_l(reg, i64.view(low));
 								state.push(KIND_REF | IN_REG, reg, 0);
 							}
+							kind = KIND_REF;
 						}
 					}
 					if (!inline_config.can_inline) {
 						masm.emit_mov_m_d(slot_addr, low);
 						if (is_v128) {
 							masm.emit_mov_m_d(slot_addr.plus(8), high);
 						}
 					}
 				}
 				Operand(_, i) => {
 					var index = orig_sp + u32.view(i) - 1;
 					if (inline_config.can_inline) {
 						visit_LOCAL_GET(u31.view(index));
 					} else {
 						masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(index));
 					}
+					kind = state.state[index].kind().code;
 				}
 				Local(_, i) => {
 					if (inline_config.can_inline) {
 						visit_LOCAL_GET(u31.view(i));
 					} else {
 						masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(u32.view(i)));
 					}
+					kind = state.state[u31.view(i)].kind().code;
 				}
 			}
-			masm.emit_mov_m_i(slot_tag_addr, kind);
+			if (!inline_config.can_inline) {
+				masm.emit_mov_m_i(slot_tag_addr, kind);
+			}
 		}
 		var whamm_instance = callee_func.instance;
 		var func_id = callee_func.decl.func_index;
@@ -558,6 +574,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			// inline codegen
 			it.reset(this.func);
 			it.dispatchLocalDecls(this);
+			this.is_inlined = true;
 			if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln();
 			while (it.more() && success) {
 				if (Trace.compiler) traceOpcodeAndStack(false);
@@ -578,7 +595,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			// restore spc after inlining
 			it.reset(orig_decl).at(orig_pc);
 			this.local_base_sp = 0;
-			this.inlined_instance_slot = -1;
+			this.is_inlined = false;
 			this.module = orig_module;
 			this.func = orig_decl;
 			this.sig = orig_sig;
@@ -604,7 +621,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			emit_reload_regs();
 			if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver);
 		}
-		return true;
 	}
 
 	def visit_CRASH_EXEC() {
@@ -753,7 +769,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		// Compute the value stack pointer.
 		emit_compute_vsp(vsp_reg, state.sp);
-
 		if (func.imp != null) { // A call to imported function must first check for WasmFunction.
 			masm.emit_br_r(func_reg, MasmBrCond.IS_WASM_FUNC, wasmcall_label);
@@ -1464,15 +1479,19 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		// XXX: recompute VFP from VSP - #slots?
 		masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot);
 		if (module.memories.length > 0) {
-			masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot);
+			if (is_inlined) {
+				masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.inlined_mem0_base_slot);
+			} else {
+				masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot);
+			}
 		}
 	}
 	def emit_load_instance(reg: Reg) {
-		var instance_addr = frame.instance_slot;
-		if (inlined_instance_slot >= 0) {
-			instance_addr = masm.slotAddr(u32.view(inlined_instance_slot));
+		if (is_inlined) { // inline compilation
+			masm.emit_mov_r_m(ValueKind.REF, reg, frame.inlined_instance_slot);
+		} else {
+			masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot);
 		}
-		masm.emit_mov_r_m(ValueKind.REF, reg, instance_addr);
 	}
 	def emitLoad(kind: ValueKind, imm: MemArg, meth: (ValueKind, Reg, Reg, Reg, u32) -> ()) {
 		var base_reg = regs.mem0_base;

diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3
index ce5b2717..f4eb4e82 100644
--- a/src/engine/x86-64/X86_64MasmRegs.v3
+++ b/src/engine/x86-64/X86_64MasmRegs.v3
@@ -145,18 +145,17 @@ component X86_64MasmRegs {
 		def m = MasmAddr(xspc.sp, _);
-		xint.accessor_slot = xspc.accessor_slot = m(X86_64InterpreterFrame.accessor.offset);
-		xint.instance_slot = xspc.instance_slot = m(X86_64InterpreterFrame.instance.offset);
-		xint.mem0_base_slot = xspc.mem0_base_slot = m(X86_64InterpreterFrame.mem0_base.offset);
-		xint.pc_slot = xspc.pc_slot = m(X86_64InterpreterFrame.curpc.offset);
-		xint.vfp_slot = xspc.vfp_slot = m(X86_64InterpreterFrame.vfp.offset);
-		xint.vsp_slot = xspc.vsp_slot = m(X86_64InterpreterFrame.vsp.offset);
-		xint.wasm_func_slot = xspc.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset);
-
-		xint.func_decl_slot = m(X86_64InterpreterFrame.func_decl.offset);
-		xint.ip_slot = m(X86_64InterpreterFrame.ip.offset);
-		xint.stp_slot = m(X86_64InterpreterFrame.stp.offset);
-		xint.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset);
+		xint.accessor_slot = xspc.accessor_slot = m(X86_64InterpreterFrame.accessor.offset);
+		xint.instance_slot = xspc.instance_slot = m(X86_64InterpreterFrame.instance.offset);
+		xint.mem0_base_slot = xspc.mem0_base_slot = m(X86_64InterpreterFrame.mem0_base.offset);
+		xint.pc_slot = xspc.pc_slot = m(X86_64InterpreterFrame.curpc.offset);
+		xint.vfp_slot = xspc.vfp_slot = m(X86_64InterpreterFrame.vfp.offset);
+		xint.vsp_slot = xspc.vsp_slot = m(X86_64InterpreterFrame.vsp.offset);
+		xint.wasm_func_slot = xspc.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset);
+		xint.ip_slot = xspc.inlined_mem0_base_slot = m(X86_64InterpreterFrame.ip.offset);
+		xint.stp_slot = xspc.inlined_instance_slot = m(X86_64InterpreterFrame.stp.offset);
+
+		xint.func_decl_slot = m(X86_64InterpreterFrame.func_decl.offset);
 		xint.code_slot = m(X86_64InterpreterFrame.code.offset);
 		xint.eip_slot = m(X86_64InterpreterFrame.eip.offset);

From 075bf018fdc28fc46d59d3821407f52d8e1e2c89 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Mon, 25 Nov 2024 18:29:00 -0500
Subject: [PATCH 04/18] update inlining check for visit_END

---
 src/engine/compiler/SinglePassCompiler.v3 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 5caf049f..2696d54d 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -684,7 +684,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		setUnreachable();
 	}
 	def visit_END() {
-		if (this.local_base_sp == 0) {
+		if (!this.is_inlined) {
 			var ctl_top = state.ctl_stack.peek();
 			if (ctl_top.opcode == Opcode.LOOP.code) {
 				state.ctl_stack.pop();

From 5db8b962371a6e2b52a3f3473e158e2e090538b8 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Mon, 25 Nov 2024 18:52:38 -0500
Subject: [PATCH 05/18] fix test failure in dyn mode

---
 src/engine/compiler/SinglePassCompiler.v3 | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 2696d54d..9cc368d4 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -471,12 +471,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 								state.push(KIND_REF | IN_REG, reg, 0);
 							}
 							low = v;
-							kind = KIND_REF;
+							kind = ValueKind.REF.code;
 						}
 						I32(v) => {
 							low = v;
 							if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v));
-							kind = KIND_I32;
+							kind = ValueKind.I32.code;
 						}
 						I64(v) => {
 							low = v;
@@ -485,7 +485,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 								masm.emit_mov_r_l(reg, i64.view(v));
 								state.push(KIND_I64 | IN_REG, reg, 0);
 							}
-							kind = KIND_I64;
+							kind = ValueKind.I64.code;
 						}
 						F32(v) => {
 							low = v;
@@ -494,7 +494,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 								masm.emit_mov_r_f32(reg, v);
 								state.push(KIND_F32 | IN_REG, reg, 0);
 							}
-							kind = KIND_F32;
+							kind = ValueKind.F32.code;
 						}
 						F64(v) => {
 							low = v;
@@ -503,7 +503,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 								masm.emit_mov_r_d64(reg, v);
 								state.push(KIND_F64 | IN_REG, reg, 0);
 							}
-							kind = KIND_F64;
+							kind = ValueKind.F64.code;
 						}
 						V128(l, h) => {
 							low = l;
@@ -514,7 +514,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 								masm.emit_mov_r_q(reg, low, high);
 								state.push(KIND_V128 | IN_REG, reg, 0);
 							}
-							kind = KIND_V128;
+							kind = ValueKind.V128.code;
 						}
 						Ref(v) => {
 							low = u64.view(Pointer.atObject(v) - Pointer.NULL);
@@ -523,7 +523,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 								masm.emit_mov_r_l(reg, i64.view(low));
 								state.push(KIND_REF | IN_REG, reg, 0);
 							}
-							kind = KIND_REF;
+							kind = ValueKind.REF.code;
 						}
 					}

From c21bf82bc3366dc216f894e42ac517f3a2ff2707 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Tue, 26 Nov 2024 20:04:00 -0500
Subject: [PATCH 06/18] whamm probe interpreter trampoline code complete

---
 src/engine/BytecodeIterator.v3                |   4 +-
 src/engine/CodePtr.v3                         |   2 +-
 src/engine/Instrumentation.v3                 |   4 +-
 src/engine/Module.v3                          |   8 +-
 src/engine/Opcodes.v3                         |   3 +-
 src/engine/v3/V3Interpreter.v3                |   4 +-
 src/engine/x86-64/V3Offsets.v3                |   1 +
 src/engine/x86-64/X86_64Interpreter.v3        |  21 +++
 src/engine/x86-64/X86_64PreGenStubs.v3        |   1 +
 src/engine/x86-64/X86_64Target.v3             |  21 +++
 .../x86-64/X86_64WhammProbeTrampoline.v3      | 137 ++++++++++++++++++
 src/util/BasicTracing.v3                      |   2 +-
 12 files changed, 198 insertions(+), 10 deletions(-)
 create mode 100644 src/engine/x86-64/X86_64WhammProbeTrampoline.v3

diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3
index bc1ccaff..c69f1964 100644
--- a/src/engine/BytecodeIterator.v3
+++ b/src/engine/BytecodeIterator.v3
@@ -50,7 +50,7 @@ class BytecodeIterator {
 		// Read the first byte of the code
 		var b = codeptr.read1();
-		if (b == InternalOpcode.PROBE.code) { // probe is inserted here
+		if (b == InternalOpcode.PROBE.code || b == InternalOpcode.WHAMM_PROBE.code) { // probe is inserted here
 			b = origptr.reset(func.orig_bytecode, pc, func.orig_bytecode.length).read1();
 		}
 		// Query opcode attributes array
@@ -101,7 +101,7 @@ class BytecodeIterator {
 		// Read the first byte of the code
 		var b = codeptr.read1();
-		if (b == InternalOpcode.PROBE.code) { // probe is inserted here
+		if (b == InternalOpcode.PROBE.code || b == InternalOpcode.WHAMM_PROBE.code) { // probe is inserted here
 			v.visitProbe();
 			b = origptr.reset(func.orig_bytecode, pc, func.orig_bytecode.length).read1();
 		}

diff --git a/src/engine/CodePtr.v3 b/src/engine/CodePtr.v3
index a36ece97..cbe61e76 100644
--- a/src/engine/CodePtr.v3
+++ b/src/engine/CodePtr.v3
@@ -14,7 +14,7 @@ class CodePtr extends DataReader {
 	}
 	def read_opcode_but_skip_probe(func: FuncDecl) -> Opcode {
 		var pc = pos, b = read1();
-		if (b == InternalOpcode.PROBE.code) b = func.orig_bytecode[pc];
+		if (b == InternalOpcode.PROBE.code || b == InternalOpcode.WHAMM_PROBE.code) b = func.orig_bytecode[pc];
 		var op = Opcodes.opcode_by_prefix[b];
 		if (op != Opcode.INVALID) return op;
 		return if(Opcodes.isPrefix(b), Opcodes.find(b, read_uleb32()));

diff --git a/src/engine/Instrumentation.v3 b/src/engine/Instrumentation.v3
index bbf9f83e..03ec232a 100644
--- a/src/engine/Instrumentation.v3
+++ b/src/engine/Instrumentation.v3
@@ -38,11 +38,12 @@ component Instrumentation {
 		match (probe) {
 			l: ProbeList => {
 				l.add(p);
+				func.activateProbingAt(offset, InternalOpcode.PROBE.code);
 				Execute.tiering.onFuncProbeInsertN(module, func, offset, p);
 			}
 			null => {
 				map[offset] = p;
-				func.activateProbingAt(offset);
+				func.activateProbingAt(offset, if (WhammProbe.?(p), InternalOpcode.WHAMM_PROBE.code, InternalOpcode.PROBE.code));
 				Execute.tiering.onFuncProbeInsert1(module, func, offset, p);
 			}
 			_ => {
@@ -50,6 +51,7 @@ component Instrumentation {
 				list.add(probe);
 				list.add(p);
 				map[offset] = list;
+				func.activateProbingAt(offset, InternalOpcode.PROBE.code);
 				Execute.tiering.onFuncProbeInsert1(module, func, offset, p);
 			}
 		}

diff --git a/src/engine/Module.v3 b/src/engine/Module.v3
index defb8cf4..67688f1c 100644
--- a/src/engine/Module.v3
+++ b/src/engine/Module.v3
@@ -121,6 +121,7 @@ class FuncDecl(sig_index: int) extends Decl {
 	var handlers = NO_HANDLERS;
 	var resume_handlers = NO_HANDLERS;
 	var target_code: TargetCode;
+	var int_probe_trampoline: TargetCode;
 	var tierup_trigger: int = int.max;
 
 	def render(names: NameSection, buf: StringBuilder) -> StringBuilder {
@@ -131,14 +132,16 @@ class FuncDecl(sig_index: int) extends Decl {
 	def setOrigCode(code: Array<byte>) -> this {
 		cur_bytecode = orig_bytecode = code;
 		var tc: TargetCode;
+		var tr: TargetCode;
 		target_code = tc; // reset target code as well
+		int_probe_trampoline = tr;
 		sidetable = Sidetables.NO_SIDETABLE;
 	}
-	def activateProbingAt(pc: int) {
+	def activateProbingAt(pc: int, probe_byte: byte) {
 		if (pc == 0) return void(entry_probed = true); // special case for function entry
 		// "orig" will become a copy of the original code, to allow in-place modification of old code
 		if (cur_bytecode == orig_bytecode) orig_bytecode = Arrays.dup(orig_bytecode);
-		cur_bytecode[pc] = InternalOpcode.PROBE.code;
+		cur_bytecode[pc] = probe_byte;
 	}
 	def deactiveProbingAt(pc: int) {
 		if (pc == 0) return;
@@ -160,6 +163,7 @@ class FuncDecl(sig_index: int) extends Decl {
 		n.sidetable = this.sidetable;
 		n.num_locals = this.num_locals;
 		n.target_code = this.target_code;
+		n.int_probe_trampoline = this.int_probe_trampoline;
 		return n;
 	}
 	def findExHandler(instance: Instance, tag: Tag, throw_pc: int) -> ExHandler {

diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3
index 3cf7678e..ea03c328 100644
--- a/src/engine/Opcodes.v3
+++ b/src/engine/Opcodes.v3
@@ -677,6 +677,7 @@ component ImmSigs {
 // Internal opcodes used by the interpreter.
 enum InternalOpcode(code: u8, mnemonic: string) {
 	PROBE(0x1E, ""), // Used to overwrite a bytecode where a probe has been inserted
+	WHAMM_PROBE(0x1D, ""),
 	//PROBE_COUNTER
 	//PROBE_COUNTER_n
 	//PROBE_TOS_i
@@ -1031,7 +1032,7 @@ class InstrTracer {
 			op = Opcodes.find(b, b2);
 			if (op == Opcode.INVALID) out.put2("%x %x ", b, b2);
 			else out.puts(op.mnemonic);
-		} else if (b == InternalOpcode.PROBE.code) {
+		} else if (b == InternalOpcode.PROBE.code || b == InternalOpcode.WHAMM_PROBE.code) {
 			out.put1("", b);
 			return;
 		} else {

diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3
index 6b3a2f98..99dd0c36 100644
--- a/src/engine/v3/V3Interpreter.v3
+++ b/src/engine/v3/V3Interpreter.v3
@@ -206,7 +206,7 @@ class V3Interpreter extends WasmStack {
 		// Read the opcode.
 		var b = codeptr.peek1();
 		var opcode: Opcode;
-		if (b == InternalOpcode.PROBE.code) {
+		if (b == InternalOpcode.PROBE.code || b == InternalOpcode.WHAMM_PROBE.code) {
 			// First local probes.
 			var throwable = Instrumentation.fireLocalProbes(DynamicLoc(func, pc, TargetFrame(frame)));
 			if (throwable != null) {
@@ -1587,7 +1587,7 @@ class V3Interpreter extends WasmStack {
 		var module = if(frame.func.instance != null, frame.func.instance.module);
 		var opcode = codeptr.data[codeptr.pos];
 		if (instrTracer == null) instrTracer = InstrTracer.new();
-		if (opcode == InternalOpcode.PROBE.code) {
+		if (opcode == InternalOpcode.PROBE.code || opcode == InternalOpcode.WHAMM_PROBE.code) {
 			OUT.puts(" ");
 			var prev = (codeptr.data, codeptr.pos, codeptr.limit);
 			codeptr.reset(frame.func.decl.orig_bytecode, prev.1, prev.2);

diff --git a/src/engine/x86-64/V3Offsets.v3 b/src/engine/x86-64/V3Offsets.v3
index f6f3646a..17a942c3 100644
--- a/src/engine/x86-64/V3Offsets.v3
+++ b/src/engine/x86-64/V3Offsets.v3
@@ -27,6 +27,7 @@ class V3Offsets {
 	def FuncDecl_orig_bytecode = int.view(Pointer.atField(decl.orig_bytecode) - Pointer.atObject(decl));
 	def FuncDecl_sidetable = int.view(Pointer.atField(decl.sidetable) - Pointer.atObject(decl));
 	def FuncDecl_target_code = int.view(Pointer.atField(decl.target_code) - Pointer.atObject(decl));
+	def FuncDecl_int_probe_trampoline = int.view(Pointer.atField(decl.int_probe_trampoline) - Pointer.atObject(decl));
 	def FuncDecl_tierup_trigger = int.view(Pointer.atField(decl.tierup_trigger) - Pointer.atObject(decl));
 	def FuncDecl_entry_probed = int.view(Pointer.atField(decl.entry_probed) - Pointer.atObject(decl));

diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3
index 61a377ae..6664cd98 100644
--- a/src/engine/x86-64/X86_64Interpreter.v3
+++ b/src/engine/x86-64/X86_64Interpreter.v3
@@ -2156,6 +2156,27 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		asm.sub_r_i(origIp, 1);
 		genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
 	}
+	// specialized handler for whamm probes
+	writeDispatchEntry(dispatchTables[0].1, InternalOpcode.WHAMM_PROBE.code, w.atEnd().pos); {
+		computeCurIpFromIp(-1);
+		computePcFromCurIp();
+		saveCallerIVars();
+		masm.emit_mov_r_m(ValueKind.REF, xenv.tmp2, MasmAddr(xenv.func_decl, masm.offsets.FuncDecl_int_probe_trampoline));
+		masm.emit_jump_r(xenv.tmp2);
+
+		ic.header.whammReentryOffset = w.atEnd().pos;
+		restoreCallerIVars();
+		// Compute a pointer to the original code at this pc offset
+		var pc = r_tmp1; // = IP - CODE
+		asm.movq_r_r(pc, r_ip);
+		asm.sub_r_m(pc, m_code);
+		var origIp = r_tmp0; // FUNC_DECL.orig_bytecode + pc - 1
+		asm.movq_r_m(origIp, r_func_decl.plus(offsets.FuncDecl_orig_bytecode));
+
+		asm.add_r_r(origIp, pc);
+		asm.sub_r_i(origIp, 1);
+		genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
+	}
 }
 def genGlobalProbeSupport() {
 	var offset = w.atEnd().pos;

diff --git a/src/engine/x86-64/X86_64PreGenStubs.v3 b/src/engine/x86-64/X86_64PreGenStubs.v3
index 047a4278..58312545 100644
--- a/src/engine/x86-64/X86_64PreGenStubs.v3
+++ b/src/engine/x86-64/X86_64PreGenStubs.v3
@@ -29,6 +29,7 @@ layout X86_64PreGenHeader {
 	+40	oobMemoryHandlerOffset:		i32; // handler for signals caused by OOB memory access
 	+44	divZeroHandlerOffset:		i32; // handler for signals caused by divide by zero
 	+48	stackOverflowHandlerOffset:	i32; // handler for signals caused by (value- or call-) stack overflow
+	+52	whammReentryOffset:		i32;
 	+56	hostCallStubOffset:		i32; // host call stub that calls runtime
 	+60	hostCallStubEnd:		i32; // host call stub that calls runtime
 	+64	codeEnd:			i32; // end of all executable code

diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3
index dbb9046f..16764c46 100644
--- a/src/engine/x86-64/X86_64Target.v3
+++ b/src/engine/x86-64/X86_64Target.v3
@@ -50,6 +50,21 @@ component Target {
 		f.target_code = TargetCode(addr);
 		Debug.afterCompile(f, u64.view(addr - Pointer.NULL));
 	}
+	def setTrampolineCode(f: FuncDecl, addr: Pointer, end: Pointer) {
+		if (Trace.compiler) {
+			Trace.OUT.put2("func[%d].target_code: break *0x%x", f.func_index, addr - Pointer.NULL)
+				.put2(" disass 0x%x, 0x%x", addr - Pointer.NULL, end - Pointer.NULL).ln();
+			var cur_byte = addr;
+			Trace.OUT.puts("JIT code: ");
+			while (cur_byte < end) {
+				Trace.OUT.put1("%x ", cur_byte.load<byte>());
+				cur_byte++;
+			}
+			Trace.OUT.ln();
+		}
+		f.int_probe_trampoline = TargetCode(addr);
+		Debug.afterCompile(f, u64.view(addr - Pointer.NULL));
+	}
 	def pregenIntoFile(filename: string) -> ErrorBuilder {
 		var data = System.fileLoad(filename);
 		var err = ErrorBuilder.new().puts("interpreter generator: ");
@@ -156,6 +171,12 @@ class X86_64ExecutionStrategy extends ExecutionStrategy {

 // One tier: fast-int, modules require no pre-processing.
 class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy {
+	def call(func: Function, args: Range<Value>) -> Result {
+		if (WasmFunction.?(func)) // eagerly generate JIT compiled trampoline for whamm probes
+			X86_64WhammTrampoline.gen(WasmFunction.!(func));
+		return X86_64StackManager.runOnFreshStack(func, args);
+	}
+
 	def onFuncValidationFinish(module: Module, func: FuncDecl, err: ErrorGen) {
 		if (err != null && !err.ok()) return;
 		Target.setUnconditionalInterpreterEntryIfMultiTier(func);

diff --git a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
new file mode 100644
index 00000000..cf19c710
--- /dev/null
+++ b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
@@ -0,0 +1,137 @@
+def env = X86_64MasmRegs.INT_EXEC_ENV;
+def PAGE_SIZE_i: int = 4096;
+def default_target_code: TargetCode;
+
+component X86_64WhammTrampoline {
+	def gen(func: WasmFunction) {
+		if (func.decl.int_probe_trampoline != default_target_code) return;
+		var it = BytecodeIterator.new();
+		var ic = X86_64PreGenStubs.getInterpreterCode();
+
+		var module = func.instance.module;
+		var entrypoint: Pointer;
+		var compiled_trampolines = Array<X86_64MacroAssembler>.new(module.functions.length);
+		var total_size = 0;
+		for (i < module.functions.length) {
+			var w = DataWriter.new();
+			var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG);
+			var func_decl = module.functions[i];
+			var has_whamm_probe = false;
+			if (func_decl.imp != null) continue;
+			it.reset(func_decl);
+
+			// generate trampoline code for each probe
+			while (it.more()) {
+				it.current();
+				var pc = it.pc;
+				var probe = Instrumentation.getLocalProbe(module, i, pc);
+				if (probe != null && WhammProbe.?(probe)) {
+					var next_label = masm.newLabel(pc * 3 + 1);
+					masm.emit_brne_r_i(env.curpc, pc, next_label);
+					genSingleProbe(WhammProbe.!(probe), pc, masm, ic);
+					masm.bindLabel(next_label);
+					has_whamm_probe = true;
+				}
+				it.next();
+			}
+			if (has_whamm_probe) {
+				compiled_trampolines[i] = masm;
+				total_size += w.atEnd().pos;
+			}
+		}
+		if (total_size == 0) return;
+		allocateCodeForModule(module, total_size);
+		for (i < module.functions.length) {
+			if (module.functions[i].imp != null || compiled_trampolines[i] == null) continue;
+			entrypoint = module.target_module.spc_code.appendCode(compiled_trampolines[i]);
+			var size = compiled_trampolines[i].w.atEnd().pos;
+			Target.setTrampolineCode(module.functions[i], entrypoint, entrypoint + size);
+		}
+	}
+}
+
+def genSingleProbe(probe: WhammProbe, pc: int, masm: X86_64MacroAssembler, ic: X86_64InterpreterCode) {
+	var valuerep = masm.valuerep;
+	var offsets = masm.getOffsets();
+	var whamm_sig = probe.sig;
+	var callee_func = WasmFunction.!(probe.func);
+	for (i < whamm_sig.length) {
+		var slot_tag_addr = MasmAddr(env.vsp, i * valuerep.slot_size + valuerep.tag_size);
+		var slot_addr = MasmAddr(env.vsp, i * valuerep.slot_size);
+		match(whamm_sig[i]) {
+			FrameAccessor => {
+				// check if we have a frame accessor already
+				var cont_label = masm.newLabel(pc * 3);
+				masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot);
+				masm.emit_br_r(env.scratch, MasmBrCond.REF_NONNULL, cont_label);
+				// special case: requires runtime call to materialize FrameAccessor object
+				masm.emit_call_runtime_materialize_frame_accessor();
+				masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot);
+				// move result to mem slot
+				masm.bindLabel(cont_label);
+				masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(env.scratch, offsets.X86_64FrameAccessor_metaRef));
+				masm.emit_mov_m_i(slot_tag_addr, ValueKind.REF.code);
+			}
+			Val(val) => {
+				var kind: byte;
+				var is_v128 = false;
+				var low: u64, high: u64;
+				match (val) {
+					I31(v) => { low = v; kind = ValueKind.REF.code; }
+					I32(v) => { low = v; kind = ValueKind.I32.code; }
+					I64(v) => { low = v; kind = ValueKind.I64.code; }
+					F32(v) => { low = v; kind = ValueKind.F32.code; }
+					F64(v) => { low = v; kind = ValueKind.F64.code; }
+					V128(l, h) => {
+						low = l;
+						high = h;
+						is_v128 = true;
+						kind = ValueKind.V128.code;
+					}
+					Ref(val) => { low = u64.view(Pointer.atObject(val) - Pointer.NULL); kind = ValueKind.REF.code; }
+				}
+				masm.emit_mov_m_d(slot_addr, low);
+				if (is_v128) {
+					masm.emit_mov_m_d(slot_addr.plus(8), high);
+				}
+				masm.emit_mov_m_i(slot_tag_addr, kind);
+			}
+			Operand(_, i) => {
+				var src_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size);
+				var src_tag_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size + valuerep.tag_size);
+				masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr);
+				masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr);
+			}
+			Local(_, i) => {
+				var src_addr = MasmAddr(env.vfp, i * valuerep.slot_size);
+				var src_tag_addr = MasmAddr(env.vfp, i * valuerep.slot_size + valuerep.tag_size);
+				masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr);
+				masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr);
+			}
+		}
+	}
+	// update vsp and call the probe function within interpreter
+	var whamm_instance = callee_func.instance;
+	var func_id = callee_func.decl.func_index;
+	masm.emit_addw_r_i(env.vsp, whamm_sig.length * valuerep.slot_size);
+	masm.asm.movq_r_l(masm.scratch, (ic.start + ic.header.intIntEntryOffset) - Pointer.NULL);
+	masm.emit_mov_r_l(env.func_arg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL);
+	masm.asm.icall_r(masm.scratch);
+
+	// jump back to whamm probe handler
+	masm.emit_mov_r_l(env.tmp0, (ic.start + ic.header.whammReentryOffset) - Pointer.NULL);
+	masm.emit_jump_r(env.tmp0);
+}
+
+def allocateCodeForModule(module: Module, codeSize: int) {
+	// Round up to the next page size.
+	var codeSize = PAGE_SIZE_i * ((codeSize + PAGE_SIZE_i - 1) / PAGE_SIZE_i);
+	// Allocate a read/write/execute mapping for code.
+	var mapping = Mmap.reserve(u64.!(codeSize), Mmap.PROT_WRITE | Mmap.PROT_READ | Mmap.PROT_EXEC);
+	var code = X86_64SpcModuleCode.new(mapping);
+	module.target_module = TargetModule(code);
+	RiRuntime.registerUserCode(code);
+	code.keepAlive();
+	if (Trace.compiler) Trace.OUT.put3("%s: reserved 0x%x ... 0x%x for trampoline-jit code",
+		module.filename, (mapping.range.start - Pointer.NULL), (mapping.range.end - Pointer.NULL)).ln();
+}
\ No newline at end of file

diff --git a/src/util/BasicTracing.v3 b/src/util/BasicTracing.v3
index e20bf859..21a87a8d 100644
--- a/src/util/BasicTracing.v3
+++ b/src/util/BasicTracing.v3
@@ -52,7 +52,7 @@ class TraceInstrProbe extends Probe {
 		var opcode = codeptr.data[codeptr.pos];
 		OUT.mark();
-		if (opcode == InternalOpcode.PROBE.code) {
+		if (opcode == InternalOpcode.PROBE.code || opcode == InternalOpcode.WHAMM_PROBE.code) {
 			OUT.puts(" ");
 			var prev = (codeptr.data, codeptr.pos, codeptr.limit);
 			codeptr.reset(func.decl.orig_bytecode, prev.1, prev.2);

From 079d59506d627bd4dca82f343d43cfc016b69229 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Tue, 26 Nov 2024 20:50:41 -0500
Subject: [PATCH 07/18] swapped tag/val location to fix test failures

---
 src/engine/x86-64/X86_64WhammProbeTrampoline.v3 | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
index cf19c710..d092154d 100644
--- a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
+++ b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
@@ -56,8 +56,8 @@ def genSingleProbe(probe: WhammProbe, pc: int, masm: X86_64MacroAssembler, ic: X
 	var whamm_sig = probe.sig;
 	var callee_func = WasmFunction.!(probe.func);
 	for (i < whamm_sig.length) {
-		var slot_tag_addr = MasmAddr(env.vsp, i * valuerep.slot_size + valuerep.tag_size);
-		var slot_addr = MasmAddr(env.vsp, i * valuerep.slot_size);
+		var slot_tag_addr = MasmAddr(env.vsp, i * valuerep.slot_size);
+		var slot_addr = MasmAddr(env.vsp, i * valuerep.slot_size + valuerep.tag_size);
 		match(whamm_sig[i]) {
 			FrameAccessor => {
@@ -97,14 +97,14 @@ def genSingleProbe(probe: WhammProbe, pc: int, masm: X86_64MacroAssembler, ic: X
 				masm.emit_mov_m_i(slot_tag_addr, kind);
 			}
 			Operand(_, i) => {
-				var src_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size);
-				var src_tag_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size + valuerep.tag_size);
+				var src_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size + valuerep.tag_size);
+				var src_tag_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size);
 				masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr);
 				masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr);
 			}
 			Local(_, i) => {
-				var src_addr = MasmAddr(env.vfp, i * valuerep.slot_size);
-				var src_tag_addr = MasmAddr(env.vfp, i * valuerep.slot_size + valuerep.tag_size);
+				var src_addr = MasmAddr(env.vfp, i * valuerep.slot_size + valuerep.tag_size);
+				var src_tag_addr = MasmAddr(env.vfp, i * valuerep.slot_size);
 				masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr);
 				masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr);
 			}
@@ -114,8 +114,8 @@ def genSingleProbe(probe: WhammProbe, pc: int, masm: X86_64MacroAssembler, ic: X
 	var whamm_instance = callee_func.instance;
 	var func_id = callee_func.decl.func_index;
 	masm.emit_addw_r_i(env.vsp, whamm_sig.length * valuerep.slot_size);
-	masm.asm.movq_r_l(masm.scratch, (ic.start + ic.header.intIntEntryOffset) - Pointer.NULL);
 	masm.emit_mov_r_l(env.func_arg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL);
+	masm.asm.movq_r_l(masm.scratch, (ic.start + ic.header.intIntEntryOffset) - Pointer.NULL);
 	masm.asm.icall_r(masm.scratch);
 
 	// jump back to whamm probe handler

From 33468075b43ba4a0dafe246ef83c9827554dd8ac Mon Sep 17 00:00:00 2001
From: Tom An
Date: Wed, 27 Nov 2024 10:43:19 -0500
Subject: [PATCH 08/18] add flag to toggle feature

---
 src/engine/Instrumentation.v3          |  5 +++-
 src/engine/Tuning.v3                   |  1 +
 src/engine/x86-64/X86_64Interpreter.v3 | 40 ++++++++++++++------------
 src/engine/x86-64/X86_64Target.v3      |  2 +-
 4 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/engine/Instrumentation.v3 b/src/engine/Instrumentation.v3
index 03ec232a..a3130ddd 100644
--- a/src/engine/Instrumentation.v3
+++ b/src/engine/Instrumentation.v3
@@ -43,7 +43,10 @@ component Instrumentation {
 			}
 			null => {
 				map[offset] = p;
-				func.activateProbingAt(offset, if (WhammProbe.?(p), InternalOpcode.WHAMM_PROBE.code, InternalOpcode.PROBE.code));
+				func.activateProbingAt(offset,
+					if (WhammProbe.?(p) && FastIntTuning.enableWhammProbeTrampoline,
+						InternalOpcode.WHAMM_PROBE.code,
+						InternalOpcode.PROBE.code));
 				Execute.tiering.onFuncProbeInsert1(module, func, offset, p);
 			}
 			_ => {

diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3
index 1ad6a42f..8be7bfe1 100644
--- a/src/engine/Tuning.v3
+++ b/src/engine/Tuning.v3
@@ -34,6 +34,7 @@ component FastIntTuning {
 	def fourByteSidetable = true;	// sidetable entries are 4-bytes
 	def entryTierUpDecrement = 1;	// "cost" of entering a function in the interpreter
 	def loopTierUpDecrement = 1;	// "cost" of looping in the interpreter
+	def enableWhammProbeTrampoline = true;
 }

 // Tuning settings for the single-pass compiler that have no effect on correctness.

diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3
index 6664cd98..7dedc3db 100644
--- a/src/engine/x86-64/X86_64Interpreter.v3
+++ b/src/engine/x86-64/X86_64Interpreter.v3
@@ -2157,25 +2157,27 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
 	}
 	// specialized handler for whamm probes
-	writeDispatchEntry(dispatchTables[0].1, InternalOpcode.WHAMM_PROBE.code, w.atEnd().pos); {
-		computeCurIpFromIp(-1);
-		computePcFromCurIp();
-		saveCallerIVars();
-		masm.emit_mov_r_m(ValueKind.REF, xenv.tmp2, MasmAddr(xenv.func_decl, masm.offsets.FuncDecl_int_probe_trampoline));
-		masm.emit_jump_r(xenv.tmp2);
-
-		ic.header.whammReentryOffset = w.atEnd().pos;
-		restoreCallerIVars();
-		// Compute a pointer to the original code at this pc offset
-		var pc = r_tmp1; // = IP - CODE
-		asm.movq_r_r(pc, r_ip);
-		asm.sub_r_m(pc, m_code);
-		var origIp = r_tmp0; // FUNC_DECL.orig_bytecode + pc - 1
-		asm.movq_r_m(origIp, r_func_decl.plus(offsets.FuncDecl_orig_bytecode));
-
-		asm.add_r_r(origIp, pc);
-		asm.sub_r_i(origIp, 1);
-		genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
+	if (FastIntTuning.enableWhammProbeTrampoline) {
+		writeDispatchEntry(dispatchTables[0].1, InternalOpcode.WHAMM_PROBE.code, w.atEnd().pos); {
+			computeCurIpFromIp(-1);
+			computePcFromCurIp();
+			saveCallerIVars();
+			masm.emit_mov_r_m(ValueKind.REF, xenv.tmp2, MasmAddr(xenv.func_decl, masm.offsets.FuncDecl_int_probe_trampoline));
+			masm.emit_jump_r(xenv.tmp2);
+
+			ic.header.whammReentryOffset = w.atEnd().pos;
+			restoreCallerIVars();
+			// Compute a pointer to the original code at this pc offset
+			var pc = r_tmp1; // = IP - CODE
+			asm.movq_r_r(pc, r_ip);
+			asm.sub_r_m(pc, m_code);
+			var origIp = r_tmp0; // FUNC_DECL.orig_bytecode + pc - 1
+			asm.movq_r_m(origIp, r_func_decl.plus(offsets.FuncDecl_orig_bytecode));
+
+			asm.add_r_r(origIp, pc);
+			asm.sub_r_i(origIp, 1);
+			genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
+		}
 	}
 }
 def genGlobalProbeSupport() {

diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3
index 16764c46..1f48fecf 100644
--- a/src/engine/x86-64/X86_64Target.v3
+++ b/src/engine/x86-64/X86_64Target.v3
@@ -172,7 +172,7 @@ class X86_64ExecutionStrategy extends ExecutionStrategy {
 // One tier: fast-int, modules require no pre-processing.
 class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy {
 	def call(func: Function, args: Range<Value>) -> Result {
-		if (WasmFunction.?(func)) // eagerly generate JIT compiled trampoline for whamm probes
+		if (WasmFunction.?(func) && FastIntTuning.enableWhammProbeTrampoline) // eagerly generate JIT compiled trampoline for whamm probes
 			X86_64WhammTrampoline.gen(WasmFunction.!(func));
 		return X86_64StackManager.runOnFreshStack(func, args);
 	}

From 6a7f564806776f33663f88ae9af5e0528f8817de Mon Sep 17 00:00:00 2001
From: Tom An
Date: Wed, 27 Nov 2024 16:44:40 -0500
Subject: [PATCH 09/18] associate trampoline with probe signature

---
 src/engine/Module.v3                          |   3 -
 src/engine/x86-64/V3Offsets.v3                |   7 +-
 src/engine/x86-64/X86_64Interpreter.v3        |  12 +-
 src/engine/x86-64/X86_64Runtime.v3            |   3 +
 src/engine/x86-64/X86_64Target.v3             |  22 +-
 .../x86-64/X86_64WhammProbeTrampoline.v3      | 220 ++++++++----------
 src/util/Whamm.v3                             |   1 +
 7 files changed, 128 insertions(+), 140 deletions(-)

diff --git a/src/engine/Module.v3 b/src/engine/Module.v3
index 67688f1c..e5fd7c85 100644
--- a/src/engine/Module.v3
+++ b/src/engine/Module.v3
@@ -121,7 +121,6 @@ class FuncDecl(sig_index: int) extends Decl {
 	var handlers = NO_HANDLERS;
 	var resume_handlers = NO_HANDLERS;
 	var target_code: TargetCode;
-	var int_probe_trampoline: TargetCode;
 	var tierup_trigger: int = int.max;
 
 	def render(names: NameSection, buf: StringBuilder) -> StringBuilder {
@@ -134,7 +133,6 @@ class FuncDecl(sig_index: int) extends Decl {
 		var tc: TargetCode;
 		var tr: TargetCode;
 		target_code = tc; // reset target code as well
-		int_probe_trampoline = tr;
 		sidetable = Sidetables.NO_SIDETABLE;
 	}
 	def activateProbingAt(pc: int, probe_byte: byte) {
@@ -163,7 +161,6 @@ class FuncDecl(sig_index: int) extends Decl {
 		n.sidetable = this.sidetable;
 		n.num_locals = this.num_locals;
 		n.target_code = this.target_code;
-		n.int_probe_trampoline = this.int_probe_trampoline;
 		return n;
 	}
 	def findExHandler(instance: Instance, tag: Tag, throw_pc: int) -> ExHandler {

diff --git a/src/engine/x86-64/V3Offsets.v3 b/src/engine/x86-64/V3Offsets.v3
index 17a942c3..86461c97 100644
--- a/src/engine/x86-64/V3Offsets.v3
+++ b/src/engine/x86-64/V3Offsets.v3
@@ -17,6 +17,7 @@ class V3Offsets {
 	private def acc = X86_64FrameAccessor.new(vs, Pointer.NULL, decl);
 	private def ha = HeapArray.new(null, []);
 	private def cnt = CountProbe.new();
+	private def whamm_Probe = WhammProbe.new(null, []);
 
 	def Function_sig = int.view(Pointer.atField(wf.sig) - Pointer.atObject(wf));
 	def WasmFunction_instance = int.view(Pointer.atField(wf.instance) - Pointer.atObject(wf));
@@ -27,9 +28,9 @@ class V3Offsets {
 	def FuncDecl_orig_bytecode = int.view(Pointer.atField(decl.orig_bytecode) - Pointer.atObject(decl));
 	def FuncDecl_sidetable = int.view(Pointer.atField(decl.sidetable) - Pointer.atObject(decl));
 	def FuncDecl_target_code = int.view(Pointer.atField(decl.target_code) - Pointer.atObject(decl));
-	def FuncDecl_int_probe_trampoline = int.view(Pointer.atField(decl.int_probe_trampoline) - Pointer.atObject(decl));
 	def FuncDecl_tierup_trigger = int.view(Pointer.atField(decl.tierup_trigger) - Pointer.atObject(decl));
 	def FuncDecl_entry_probed = int.view(Pointer.atField(decl.entry_probed) - Pointer.atObject(decl));
def FuncDecl_func_index = int.view(Pointer.atField(decl.func_index) - Pointer.atObject(decl)); def SigDecl_params = int.view(Pointer.atField(sig.params) - Pointer.atObject(sig)); def SigDecl_results = int.view(Pointer.atField(sig.results) - Pointer.atObject(sig)); @@ -40,6 +41,7 @@ class V3Offsets { def Instance_sig_ids = int.view(Pointer.atField(i.sig_ids) - Pointer.atObject(i)); def Instance_dropped_elems = int.view(Pointer.atField(i.dropped_elems) - Pointer.atObject(i)); def Instance_dropped_data = int.view(Pointer.atField(i.dropped_data) - Pointer.atObject(i)); + def Instance_module = int.view(Pointer.atField(i.module) - Pointer.atObject(i)); def Table_funcs = int.view(Pointer.atField(t.funcs) - Pointer.atObject(t)); def Table_elems = int.view(Pointer.atField(t.elems) - Pointer.atObject(t)); @@ -65,7 +67,10 @@ class V3Offsets { def HeapArray_vals = int.view(Pointer.atField(ha.vals) - Pointer.atObject(ha)); def CountProbe_count = int.view(Pointer.atField(cnt.count) - Pointer.atObject(cnt)); + def WhammProbe_trampoline = int.view(Pointer.atField(whamm_Probe.trampoline) - Pointer.atObject(whamm_Probe)); + def WhammProbe_func = int.view(Pointer.atField(whamm_Probe.func) - Pointer.atObject(whamm_Probe)); + def Module_probes = int.view(Pointer.atField(module.probes) - Pointer.atObject(module)); // Try to future-proof for compressed references someday and use REF_SIZE everywhere def REF_SIZE = byte.!(Pointer.atElement(mems, 1) - Pointer.atElement(mems, 0)); def INT_SIZE: byte = 4; diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 7dedc3db..1d1aac03 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -2159,12 +2159,20 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // specialized handler for whamm probes if (FastIntTuning.enableWhammProbeTrampoline) { writeDispatchEntry(dispatchTables[0].1, InternalOpcode.WHAMM_PROBE.code, w.atEnd().pos); { + // call runtime to get the WhammProbe object computeCurIpFromIp(-1); computePcFromCurIp(); saveCallerIVars(); - masm.emit_mov_r_m(ValueKind.REF, xenv.tmp2, MasmAddr(xenv.func_decl, masm.offsets.FuncDecl_int_probe_trampoline)); - masm.emit_jump_r(xenv.tmp2); + asm.movq_r_m(r_tmp0, m_wasm_func); + callRuntime(refRuntimeCall(RT.runtime_GET_LOCAL_PROBE), [r_tmp0, r_curpc], false); + // jump to the trampoline + asm.movq_r_m(r_vsp, m_vsp); + masm.emit_mov_r_m(ValueKind.REF, xenv.func_arg, MasmAddr(xenv.runtime_ret0, masm.offsets.WhammProbe_func)); + masm.emit_mov_r_m(ValueKind.REF, xenv.runtime_ret0, MasmAddr(xenv.runtime_ret0, masm.offsets.WhammProbe_trampoline)); + masm.emit_jump_r(xenv.runtime_ret0); + + // reentry point from the trampoline ic.header.whammReentryOffset = w.atEnd().pos; restoreCallerIVars(); // Compute a pointer to the original code at this pc offset diff --git a/src/engine/x86-64/X86_64Runtime.v3 b/src/engine/x86-64/X86_64Runtime.v3 index 619c13a0..accda0a2 100644 --- a/src/engine/x86-64/X86_64Runtime.v3 +++ b/src/engine/x86-64/X86_64Runtime.v3 @@ -106,6 +106,9 @@ component X86_64Runtime { if (ret != null) return stack.throw(ret); return ret; } + def runtime_GET_LOCAL_PROBE(func: WasmFunction, pc: int) -> Probe { + return func.instance.module.probes[func.decl.func_index][pc]; + } def runtime_materialize_frame_accessor() { var rsp = CiRuntime.callerSp(); var frame = TargetFrame(rsp); diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index 1f48fecf..c98e2b2e 100644 --- 
a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -50,21 +50,6 @@ component Target { f.target_code = TargetCode(addr); Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); } - def setTrampolineCode(f: FuncDecl, addr: Pointer, end: Pointer) { - if (Trace.compiler) { - Trace.OUT.put2("func[%d].target_code: break *0x%x", f.func_index, addr - Pointer.NULL) - .put2(" disass 0x%x, 0x%x", addr - Pointer.NULL, end - Pointer.NULL).ln(); - var cur_byte = addr; - Trace.OUT.puts("JIT code: "); - while (cur_byte < end) { - Trace.OUT.put1("%x ", cur_byte.load()); - cur_byte++; - } - Trace.OUT.ln(); - } - f.int_probe_trampoline = TargetCode(addr); - Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); - } def pregenIntoFile(filename: string) -> ErrorBuilder { var data = System.fileLoad(filename); var err = ErrorBuilder.new().puts("interpreter generator: "); @@ -172,8 +157,6 @@ class X86_64ExecutionStrategy extends ExecutionStrategy { // One tier: fast-int, modules require no pre-processing. class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { def call(func: Function, args: Range) -> Result { - if (WasmFunction.?(func) && FastIntTuning.enableWhammProbeTrampoline) // eagerly generate JIT compiled trampoline for whamm probes - X86_64WhammTrampoline.gen(WasmFunction.!(func)); return X86_64StackManager.runOnFreshStack(func, args); } @@ -184,6 +167,11 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { def onNewFunction(wf: WasmFunction, err: ErrorGen) { Target.setUnconditionalInterpreterEntryIfMultiTier(wf.decl); } + + def onFuncProbeInsert1(module: Module, func: FuncDecl, offset: int, p: Probe) { + if (FastIntTuning.enableWhammProbeTrampoline && WhammProbe.?(p)) + X86_64WhammTrampoline.genSingleProbe(WhammProbe.!(p), X86_64PreGenStubs.getInterpreterCode()); + } } // Base class of all strategies that use SPC. 
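[Note on the rewrite below: it moves trampoline generation from an eager per-function pass to a lazy per-signature cache. A stub is generated the first time a given probe signature is seen and reused for every WhammProbe whose signature compares equal element-wise, with the entry pointer stored on the probe itself. A minimal sketch of the lookup-or-generate flow; ensureTrampoline, findCached, generate, and cache are hypothetical stand-ins for getEntry, the masm codegen in genSingleProbe, and the trampoline_entries list in the diff that follows:

	// Sketch only; not part of the patch.
	def ensureTrampoline(probe: WhammProbe) {
		var entry = findCached(probe.sig);	// linear scan of trampoline_entries, comparing signatures
		if (entry == Pointer.NULL) {
			entry = generate(probe.sig);	// emit the argument-copying stub once per signature
			cache(probe.sig, entry);	// remember it for later probes with an equal signature
		}
		probe.trampoline = TargetCode(entry);	// the WHAMM_PROBE dispatch jumps here
	}

Sharing one stub across probes is possible because the stub no longer hard-codes the callee function: the interpreter's WHAMM_PROBE handler loads WhammProbe.func into func_arg before jumping to the trampoline.]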
diff --git a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 index d092154d..c406c119 100644 --- a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 +++ b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 @@ -1,137 +1,123 @@ def env = X86_64MasmRegs.INT_EXEC_ENV; def PAGE_SIZE_i: int = 4096; -def default_target_code: TargetCode; +def default_target_code: X86_64SpcModuleCode; +var trampoline_code: X86_64SpcModuleCode; +var trampoline_entries: List<(Array, Pointer)>; component X86_64WhammTrampoline { - def gen(func: WasmFunction) { - if (func.decl.int_probe_trampoline != default_target_code) return; - var it = BytecodeIterator.new(); - var ic = X86_64PreGenStubs.getInterpreterCode(); - - var module = func.instance.module; - var entrypoint: Pointer; - var compiled_trampolines = Array.new(module.functions.length); - var total_size = 0; - for (i < module.functions.length) { + def genSingleProbe(probe: WhammProbe, ic: X86_64InterpreterCode) { + if (trampoline_code == default_target_code) allocateCodeForTrampoline(); + var whamm_sig = probe.sig; + var entry_ptr = getEntry(whamm_sig); + if (entry_ptr == Pointer.NULL) { var w = DataWriter.new(); var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); - var func_decl = module.functions[i]; - var has_whamm_probe = false; - if (func_decl.imp != null) continue; - it.reset(func_decl); - - // generate trampoline code for each probe - while (it.more()) { - it.current(); - var pc = it.pc; - var probe = Instrumentation.getLocalProbe(module, i, pc); - if (probe != null && WhammProbe.?(probe)) { - var next_label = masm.newLabel(pc * 3 + 1); - masm.emit_brne_r_i(env.curpc, pc, next_label); - genSingleProbe(WhammProbe.!(probe), pc, masm, ic); - masm.bindLabel(next_label); - has_whamm_probe = true; + var valuerep = masm.valuerep; + var offsets = masm.getOffsets(); + for (i < whamm_sig.length) { + var slot_tag_addr = MasmAddr(env.vsp, i * valuerep.slot_size); + var slot_addr = MasmAddr(env.vsp, i * valuerep.slot_size + valuerep.tag_size); + match(whamm_sig[i]) { + FrameAccessor => { + // check if we have a frame accessor already + var cont_label = masm.newLabel(0); + masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot); + masm.emit_br_r(env.scratch, MasmBrCond.REF_NONNULL, cont_label); + // special case: requires runtime call to materialize FrameAccessor object + masm.emit_call_runtime_materialize_frame_accessor(); + masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot); + // move result to mem slot + masm.bindLabel(cont_label); + masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(env.scratch, offsets.X86_64FrameAccessor_metaRef)); + masm.emit_mov_m_i(slot_tag_addr, ValueKind.REF.code); + } + Val(val) => { + var kind: byte; + var is_v128 = false; + var low: u64, high: u64; + match (val) { + I31(v) => { low = v; kind = ValueKind.REF.code; } + I32(v) => { low = v; kind = ValueKind.I32.code; } + I64(v) => { low = v; kind = ValueKind.I64.code; } + F32(v) => { low = v; kind = ValueKind.F32.code; } + F64(v) => { low = v; kind = ValueKind.F64.code; } + V128(l, h) => { + low = l; + high = h; + is_v128 = true; + kind = ValueKind.V128.code; + } + Ref(val) => { low = u64.view(Pointer.atObject(val) - Pointer.NULL); kind = ValueKind.REF.code; } + } + masm.emit_mov_m_d(slot_addr, low); + if (is_v128) { + masm.emit_mov_m_d(slot_addr.plus(8), high); + } + masm.emit_mov_m_i(slot_tag_addr, kind); + } + Operand(_, i) => { + var src_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size + 
valuerep.tag_size); + var src_tag_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size); + masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr); + masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr); + } + Local(_, i) => { + var src_addr = MasmAddr(env.vfp, i * valuerep.slot_size + valuerep.tag_size); + var src_tag_addr = MasmAddr(env.vfp, i * valuerep.slot_size); + masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr); + masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr); + } } - it.next(); } - if (has_whamm_probe) { - compiled_trampolines[i] = masm; - total_size += w.atEnd().pos; - } - } - if (total_size == 0) return; - allocateCodeForModule(module, total_size); - for (i < module.functions.length) { - if (module.functions[i].imp != null || compiled_trampolines[i] == null) continue; - entrypoint = module.target_module.spc_code.appendCode(compiled_trampolines[i]); - var size = compiled_trampolines[i].w.atEnd().pos; - Target.setTrampolineCode(module.functions[i], entrypoint, entrypoint + size); + // update vsp and call the probe function within interpreter + masm.emit_addw_r_i(env.vsp, whamm_sig.length * valuerep.slot_size); + masm.asm.movq_r_l(masm.scratch, (ic.start + ic.header.intIntEntryOffset) - Pointer.NULL); + masm.asm.icall_r(masm.scratch); + + // jump back to whamm probe handler + masm.emit_mov_r_l(env.tmp0, (ic.start + ic.header.whammReentryOffset) - Pointer.NULL); + masm.emit_jump_r(env.tmp0); + var addr = setTrampolineCode(masm); + trampoline_entries = List<(Array, Pointer)>.new((whamm_sig, addr), trampoline_entries); + entry_ptr = addr; } + probe.trampoline = TargetCode(entry_ptr); } } -def genSingleProbe(probe: WhammProbe, pc: int, masm: X86_64MacroAssembler, ic: X86_64InterpreterCode) { - var valuerep = masm.valuerep; - var offsets = masm.getOffsets(); - var whamm_sig = probe.sig; - var callee_func = WasmFunction.!(probe.func); - for (i < whamm_sig.length) { - var slot_tag_addr = MasmAddr(env.vsp, i * valuerep.slot_size); - var slot_addr = MasmAddr(env.vsp, i * valuerep.slot_size + valuerep.tag_size); - match(whamm_sig[i]) { - FrameAccessor => { - // check if we have a frame accessor already - var cont_label = masm.newLabel(pc * 3); - masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot); - masm.emit_br_r(env.scratch, MasmBrCond.REF_NONNULL, cont_label); - // special case: requires runtime call to materialize FrameAccessor object - masm.emit_call_runtime_materialize_frame_accessor(); - masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot); - // move result to mem slot - masm.bindLabel(cont_label); - masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(env.scratch, offsets.X86_64FrameAccessor_metaRef)); - masm.emit_mov_m_i(slot_tag_addr, ValueKind.REF.code); - } - Val(val) => { - var kind: byte; - var is_v128 = false; - var low: u64, high: u64; - match (val) { - I31(v) => { low = v; kind = ValueKind.REF.code; } - I32(v) => { low = v; kind = ValueKind.I32.code; } - I64(v) => { low = v; kind = ValueKind.I64.code; } - F32(v) => { low = v; kind = ValueKind.F32.code; } - F64(v) => { low = v; kind = ValueKind.F64.code; } - V128(l, h) => { - low = l; - high = h; - is_v128 = true; - kind = ValueKind.V128.code; - } - Ref(val) => { low = u64.view(Pointer.atObject(val) - Pointer.NULL); kind = ValueKind.REF.code; } - } - masm.emit_mov_m_d(slot_addr, low); - if (is_v128) { - masm.emit_mov_m_d(slot_addr.plus(8), high); - } - masm.emit_mov_m_i(slot_tag_addr, kind); - } - Operand(_, i) => { - var src_addr = MasmAddr(env.vsp, (i - 1) * 
valuerep.slot_size + valuerep.tag_size); - var src_tag_addr = MasmAddr(env.vsp, (i - 1) * valuerep.slot_size); - masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr); - masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr); - } - Local(_, i) => { - var src_addr = MasmAddr(env.vfp, i * valuerep.slot_size + valuerep.tag_size); - var src_tag_addr = MasmAddr(env.vfp, i * valuerep.slot_size); - masm.emit_mov_m_m(ValueKind.REF, slot_addr, src_addr); - masm.emit_mov_m_m(ValueKind.REF, slot_tag_addr, src_tag_addr); - } - } +def getEntry(sig: Array) -> Pointer { + for (entry = trampoline_entries; entry != null; entry = entry.tail) { + if (Arrays.equal(entry.head.0, sig)) return entry.head.1; } - // update vsp and call the probe function within interpreter - var whamm_instance = callee_func.instance; - var func_id = callee_func.decl.func_index; - masm.emit_addw_r_i(env.vsp, whamm_sig.length * valuerep.slot_size); - masm.emit_mov_r_l(env.func_arg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL); - masm.asm.movq_r_l(masm.scratch, (ic.start + ic.header.intIntEntryOffset) - Pointer.NULL); - masm.asm.icall_r(masm.scratch); - - // jump back to whamm probe handler - masm.emit_mov_r_l(env.tmp0, (ic.start + ic.header.whammReentryOffset) - Pointer.NULL); - masm.emit_jump_r(env.tmp0); + return Pointer.NULL; } -def allocateCodeForModule(module: Module, codeSize: int) { - // Round up to the next page size. - var codeSize = PAGE_SIZE_i * ((codeSize + PAGE_SIZE_i - 1) / PAGE_SIZE_i); +def allocateCodeForTrampoline() { + // Allocate 10 pages to ensure that we have enough space for all trampoline code. + var code_size = PAGE_SIZE_i * 10; // Allocate a read/write/execute mapping for code. - var mapping = Mmap.reserve(u64.!(codeSize), Mmap.PROT_WRITE | Mmap.PROT_READ | Mmap.PROT_EXEC); + var mapping = Mmap.reserve(u64.!(code_size), Mmap.PROT_WRITE | Mmap.PROT_READ | Mmap.PROT_EXEC); var code = X86_64SpcModuleCode.new(mapping); - module.target_module = TargetModule(code); RiRuntime.registerUserCode(code); code.keepAlive(); - if (Trace.compiler) Trace.OUT.put3("%s: reserved 0x%x ... 0x%x for trampoline-jit code", - module.filename, (mapping.range.start - Pointer.NULL), (mapping.range.end - Pointer.NULL)).ln(); + if (Trace.compiler) Trace.OUT.put2("Reserved 0x%x ... 0x%x for WhammProbe trampoline jit code", + (mapping.range.start - Pointer.NULL), (mapping.range.end - Pointer.NULL)).ln(); + trampoline_code = code; +} + +def setTrampolineCode(masm: X86_64MacroAssembler) -> Pointer { + var addr = trampoline_code.appendCode(masm); + var end = addr + masm.w.atEnd().pos; + if (Trace.compiler) { + Trace.OUT.put1("Single WhammProbe trampoline code: break *0x%x", addr - Pointer.NULL) + .put2(" disass 0x%x, 0x%x", addr - Pointer.NULL, end - Pointer.NULL).ln(); + var cur_byte = addr; + Trace.OUT.puts("JIT code: "); + while (cur_byte < end) { + Trace.OUT.put1("%x ", cur_byte.load()); + cur_byte++; + } + Trace.OUT.ln(); + } + return addr; } \ No newline at end of file diff --git a/src/util/Whamm.v3 b/src/util/Whamm.v3 index 4e0dce02..4d943948 100644 --- a/src/util/Whamm.v3 +++ b/src/util/Whamm.v3 @@ -90,6 +90,7 @@ component Whamm { // A probe that adapts a Wasm function to be called by the engine-internal probing mechanism. class WhammProbe(func: Function, sig: Array) extends Probe { + var trampoline: TargetCode; // properties set by the spc to make inlining optimization decisions. 
 var inline_heuristic_checked = false;
 var spc_inline_func = false;

From 79604df0c7593b9d55eb675e3ef40787299 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Wed, 27 Nov 2024 17:58:28 -0500
Subject: [PATCH 10/18] resolve merge conflict

---
 src/engine/x86-64/X86_64WhammProbeTrampoline.v3 | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
index c406c119..e98c372a 100644
--- a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
+++ b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
@@ -19,16 +19,8 @@ component X86_64WhammTrampoline {
 			var slot_addr = MasmAddr(env.vsp, i * valuerep.slot_size + valuerep.tag_size);
 			match(whamm_sig[i]) {
 				FrameAccessor => {
-					// check if we have a frame accessor already
-					var cont_label = masm.newLabel(0);
-					masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot);
-					masm.emit_br_r(env.scratch, MasmBrCond.REF_NONNULL, cont_label);
-					// special case: requires runtime call to materialize FrameAccessor object
-					masm.emit_call_runtime_materialize_frame_accessor();
-					masm.emit_mov_r_m(ValueKind.REF, env.scratch, env.accessor_slot);
-					// move result to mem slot
-					masm.bindLabel(cont_label);
-					masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(env.scratch, offsets.X86_64FrameAccessor_metaRef));
+					masm.emit_call_runtime_getFrameAccessorMetaRef();
+					masm.emit_mov_m_r(ValueKind.REF, slot_addr, env.runtime_ret0);
 					masm.emit_mov_m_i(slot_tag_addr, ValueKind.REF.code);
 				}
 				Val(val) => {

From 1c2502773c45d8513daa12fecbe65ae8ef20eb0e Mon Sep 17 00:00:00 2001
From: Tom An
Date: Wed, 27 Nov 2024 18:00:17 -0500
Subject: [PATCH 11/18] inline tier

---
 src/engine/compiler/SinglePassCompiler.v3 | 331 +++++++++++++++++-----
 1 file changed, 255 insertions(+), 76 deletions(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 0e0c68a6..370a41dc 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -88,7 +88,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	def state = SpcState.new(regAlloc);
 	// Other state
 	def trap_labels = Vector<(TrapReason, MasmLabel)>.new();
-	var start_pos = 0;
+	//var start_pos = 0;
 	var module: Module;
 	var func: FuncDecl;
 	var sig: SigDecl;
@@ -103,6 +103,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	var last_probe = 0;
 	var skip_to_end: bool;

+	// when a function is inlined, we continue using the caller's abstract state and
+	// push the callee's params/locals as needed; thus we need to track the base sp
+	// of the locals in the current context.
+	var local_base_sp: u31 = 0;
+	// certain inlined functions need access to their own instance, which might differ from
+	// the caller's, so we simply store the callee instance address on the abstract state and
+	// mark the slot. If the slot index is negative, we use the current instance whenever needed.
+ var inlined_instance_slot = -1; + new() { masm.unimplemented = unsupported; masm.newTrapLabel = newTrapLabel; // trap labels are per-pc @@ -159,6 +168,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl last_probe = 0; masm.source_loc = it.pc; it.dispatch(this); + if (Trace.compiler) { + OUT.puts("JIT code: "); + masm.printCodeBytes(OUT); + OUT.ln(); + } unrefRegs(); if (Debug.compiler) checkRegAlloc(); it.next(); @@ -352,11 +366,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_debugger_breakpoint(); return; } - x: WhammProbe => { - if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func)) { - emitWhammProbe(x); - return; - } + x: WhammProbe => if (SpcTuning.intrinsifyWhammProbe && WasmFunction.?(x.func)){ + emitWhammProbe(x); + return; } } // spill everything @@ -377,12 +389,46 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe def emitWhammProbe(probe: WhammProbe) { - // spill entire value stack. - state.emitSaveAll(resolver, probeSpillMode); // set up args and push to frame slots. var whamm_sig = probe.sig; var offsets = masm.getOffsets(); + var inline_config = InlineConfig(false, false, false); + var new_local_base_sp = 0; + var orig_sp = state.sp; var callee_func = WasmFunction.!(probe.func); + + if (SpcTuning.inlineSmallFunc) { + // TODO: can reuse when implementing inlining for SPC + inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func); + if (!probe.inline_heuristic_checked) { + inline_config = funcCanInline(callee_func.decl); + probe.inline_heuristic_checked = true; + probe.spc_swap_instance = inline_config.swap_instance; + probe.spc_swap_membase = inline_config.swap_membase; + } + + if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly + var whamm_instance_addr = Pointer.atObject(callee_func.instance) - Pointer.NULL; + var slot_addr = masm.slotAddr(state.sp); + inlined_instance_slot = int.view(state.sp); + state.push(KIND_REF | IS_STORED, NO_REG, 0); + masm.emit_mov_m_l(slot_addr, whamm_instance_addr); + } + + // overwrite mem0_base with whamm instance's memory base, restore from frame slot later + if (inline_config.swap_membase) { + var memobj_addr = Pointer.atObject(callee_func.instance.memories[0]) - Pointer.NULL; + masm.emit_mov_r_l(regs.mem0_base, i64.view(memobj_addr)); + masm.emit_read_v3_mem_base(regs.mem0_base, regs.mem0_base); + } + } + + if (!inline_config.can_inline) { + state.emitSaveAll(resolver, probeSpillMode); + } else { + new_local_base_sp = int.view(state.sp); + } + for (i < whamm_sig.length) { var slot_tag_addr = masm.tagAddr(state.sp + u32.view(i)); var slot_addr = masm.slotAddr(state.sp + u32.view(i)); @@ -396,67 +442,167 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_br_r(regs.scratch, MasmBrCond.REF_NONNULL, cont_label); // special case: requires runtime call to materialize FrameAccessor object + if (inline_config.can_inline) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack. 
masm.emit_call_runtime_materialize_frame_accessor(); masm.emit_mov_r_m(ValueKind.REF, regs.scratch, frame.accessor_slot); + emit_reload_regs(); + if (inline_config.can_inline && !probeSpillMode.free_regs) state.emitRestoreAll(resolver); - // move result to mem slot + // move result to mem slot or reg, depending on inlining masm.bindLabel(cont_label); - masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef)); - kind = ValueKind.REF.code; + if (inline_config.can_inline) { + var reg = allocRegTos(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, reg, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef)); + state.push(KIND_REF | IN_REG, reg, 0); + } else { + masm.emit_mov_m_m(ValueKind.REF, slot_addr, MasmAddr(regs.scratch, offsets.X86_64FrameAccessor_metaRef)); + } } Val(val) => { var is_v128 = false; var low: u64, high: u64; match (val) { - I31(v) => { low = v; kind = ValueKind.REF.code; } - I32(v) => { low = v; kind = ValueKind.I32.code; } - I64(v) => { low = v; kind = ValueKind.I64.code; } - F32(v) => { low = v; kind = ValueKind.F32.code; } - F64(v) => { low = v; kind = ValueKind.F64.code; } + I31(v) => { + if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v)); + low = v; + } + I32(v) => { + low = v; + if (inline_config.can_inline) state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v)); + } + I64(v) => { + low = v; + if (inline_config.can_inline) { + var reg = allocRegTos(ValueKind.I64); + masm.emit_mov_r_l(reg, i64.view(v)); + state.push(KIND_I64 | IN_REG, reg, 0); + } + } + F32(v) => { + low = v; + if (inline_config.can_inline) { + var reg = allocRegTos(ValueKind.F32); + masm.emit_mov_r_f32(reg, v); + state.push(KIND_F32 | IN_REG, reg, 0); + } + } + F64(v) => { + low = v; + if (inline_config.can_inline) { + var reg = allocRegTos(ValueKind.F64); + masm.emit_mov_r_d64(reg, v); + state.push(KIND_F64 | IN_REG, reg, 0); + } + } V128(l, h) => { low = l; high = h; is_v128 = true; - kind = ValueKind.V128.code; + if (inline_config.can_inline) { + var reg = allocRegTos(ValueKind.V128); + masm.emit_mov_r_q(reg, low, high); + state.push(KIND_V128 | IN_REG, reg, 0); + } + } + Ref(v) => { + low = u64.view(Pointer.atObject(v) - Pointer.NULL); + if (inline_config.can_inline) { + var reg = allocRegTos(ValueKind.REF); + masm.emit_mov_r_l(reg, i64.view(low)); + state.push(KIND_REF | IN_REG, reg, 0); + } } - Ref(val) => { low = u64.view(Pointer.atObject(val) - Pointer.NULL); kind = ValueKind.REF.code; } } - masm.emit_mov_m_d(slot_addr, low); - if (is_v128) { - masm.emit_mov_m_d(slot_addr.plus(8), high); + if (!inline_config.can_inline) { + masm.emit_mov_m_d(slot_addr, low); + if (is_v128) { + masm.emit_mov_m_d(slot_addr.plus(8), high); + } } } Operand(_, i) => { - var slot = state.sp + u32.view(i) - 1; - kind = state.state[slot].kind().code; - masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(slot)); + var index = orig_sp + u32.view(i); + if (inline_config.can_inline) { + visit_LOCAL_GET(u31.view(index)); + } else { + masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(index)); + } } Local(_, i) => { - var slot = u32.view(i); - kind = state.state[slot].kind().code; - masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(slot)); + if (inline_config.can_inline) { + visit_LOCAL_GET(u31.view(i)); + } else { + masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(u32.view(i))); + } } } masm.emit_mov_m_i(slot_tag_addr, kind); } var whamm_instance = callee_func.instance; var func_id = 
callee_func.decl.func_index; + var whamm_module = whamm_instance.module; + var whamm_func_decl = callee_func.decl; + if (inline_config.can_inline) { + var orig_decl = it.func; + var orig_pc = it.pc; + var orig_module = module; + var orig_sig = sig; + + // prepare spc for inlining + this.local_base_sp = u31.view(new_local_base_sp); + this.module = whamm_module; + this.func = whamm_func_decl; + this.sig = whamm_func_decl.sig; + + // inline codegen + it.reset(this.func); + it.dispatchLocalDecls(this); + if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln(); + while (it.more() && success) { + if (Trace.compiler) traceOpcodeAndStack(false); + last_probe = 0; + masm.source_loc = it.pc; + it.dispatch(this); + if (Trace.compiler) { + OUT.puts("JIT code: "); + masm.printCodeBytes(OUT); + OUT.ln(); + } + unrefRegs(); + if (Debug.compiler) checkRegAlloc(); + it.next(); + } + if (Trace.compiler) Trace.OUT.puts("Finished compiling inlined whamm probe").ln(); + + // restore spc after inlining + it.reset(orig_decl).at(orig_pc); + this.local_base_sp = 0; + this.inlined_instance_slot = -1; + this.module = orig_module; + this.func = orig_decl; + this.sig = orig_sig; + if (inline_config.swap_membase) { + masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); + } - var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); - var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); - var tmp = allocTmp(ValueKind.REF); - - // Load the target code/entrypoint. - masm.emit_mov_r_l(func_reg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL); - masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(func_reg, offsets.WasmFunction_decl)); - masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(tmp, offsets.FuncDecl_target_code)); - // adjust vsp_reg to compute the "true" VSP, accounting for args to WhammProbe's WasmFunction - emit_compute_vsp(vsp_reg, state.sp + u32.view(whamm_sig.length)); - // Call to the entrypoint. - masm.emit_call_r(tmp); - emit_unwind_check(); - emit_reload_regs(); - if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); + // clear callee params/locals from abstract state + dropN(state.sp - orig_sp); + } else { + var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); + var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); + var tmp = allocTmp(ValueKind.REF); + + // Load the target code/entrypoint. + masm.emit_mov_r_l(func_reg, Pointer.atObject(whamm_instance.functions[func_id]) - Pointer.NULL); + masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(func_reg, offsets.WasmFunction_decl)); + masm.emit_mov_r_m(ValueKind.REF, tmp, MasmAddr(tmp, offsets.FuncDecl_target_code)); + // adjust vsp_reg to compute the "true" VSP, accounting for args to WhammProbe's WasmFunction + emit_compute_vsp(vsp_reg, state.sp + u32.view(whamm_sig.length)); + // Call to the entrypoint. 
+ masm.emit_call_r(tmp); + emit_reload_regs(); + if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); + } } def visit_CRASH_EXEC() { @@ -520,35 +666,37 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - var ctl_top = state.ctl_stack.peek(); - if (ctl_top.opcode == Opcode.LOOP.code) { - state.ctl_stack.pop(); - if (!ctl_top.reachable) setUnreachable(); - } else if (ctl_top.opcode == Opcode.IF.code) { - // simulate empty if-true block - state.emitFallthru(resolver); - masm.emit_br(ctl_top.label); - masm.bindLabel(ctl_top.else_label); - state.doElse(); - ctl_top.opcode = Opcode.ELSE.code; - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.RETURN.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); + if (this.local_base_sp == 0) { + var ctl_top = state.ctl_stack.peek(); + if (ctl_top.opcode == Opcode.LOOP.code) { + state.ctl_stack.pop(); + if (!ctl_top.reachable) setUnreachable(); + } else if (ctl_top.opcode == Opcode.IF.code) { + // simulate empty if-true block + state.emitFallthru(resolver); + masm.emit_br(ctl_top.label); + masm.bindLabel(ctl_top.else_label); + state.doElse(); + ctl_top.opcode = Opcode.ELSE.code; + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == Opcode.RETURN.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + emitProbe(); + if (ctl_top.merge_count > 1) emitReturn(ctl_top); + state.ctl_stack.pop(); + } emitProbe(); - if (ctl_top.merge_count > 1) emitReturn(ctl_top); - state.ctl_stack.pop(); } - emitProbe(); } def visit_BR(depth: u31) { var target = state.getControl(depth); @@ -824,6 +972,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl dropN(valcount); } def visit_LOCAL_GET(index: u31) { + index = index + local_base_sp; var lv = state.get(index); if (lv.inReg()) { regAlloc.assign(lv.reg, int.!(state.sp)); @@ -841,6 +990,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } } def visit_LOCAL_SET(index: u31) { + index = index + local_base_sp; var lv = state.get(index); var sv = state.pop(); if (sv.inReg()) { @@ -865,6 +1015,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } } def visit_LOCAL_TEE(index: u31) { + index = index + local_base_sp; var lv = state.get(index); regAlloc.unassign(lv.reg, index); // unref existing register var sv = state.peek(); @@ -955,7 +1106,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (i32.view(val) == val) { state.push(KIND_I64 | IS_CONST, NO_REG, i32.view(val)); } else { - var tos = state.sp; var reg = allocRegTos(ValueKind.I64); masm.emit_mov_r_l(reg, val); state.push(KIND_I64 | IN_REG, reg, 0); @@ -1316,7 +1466,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: 
RegAl
 		}
 	}
 	def emit_load_instance(reg: Reg) {
-		masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot);
+		var instance_addr = frame.instance_slot;
+		if (inlined_instance_slot >= 0) {
+			instance_addr = masm.slotAddr(u32.view(inlined_instance_slot));
+		}
+		masm.emit_mov_r_m(ValueKind.REF, reg, instance_addr);
 	}
 	def emitLoad(kind: ValueKind, imm: MemArg, meth: (ValueKind, Reg, Reg, Reg, u32) -> ()) {
 		var base_reg = regs.mem0_base;
@@ -1546,11 +1700,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	}
 	def traceOpcode(orig: bool) {
 		OUT.flush();
-		var pc = it.pc - start_pos;
 		instrTracer.instr_width = Opcodes.longestName + 1;
-		instrTracer.putPcAndInstr(OUT, module, func, pc, orig);
-		OUT.puts("JIT code: ");
-		masm.printCodeBytes(OUT);
+		instrTracer.putPcAndInstr(OUT, module, func, it.pc, orig);
 		OUT.ln();
 	}
 }
@@ -2196,4 +2347,32 @@ class MoveNode {
 	var src: MoveNode;	// source of the value for this node
 	var dstList: MoveNode;	// head of destination list
 	var dstNext: MoveNode;	// next in a list of successors
-}
\ No newline at end of file
+}
+
+// checks function bytecode to see if it can be inlined based on
+// simple heuristics: bytecode length <= 50, at most 10 params, and straight-line code.
+def funcCanInline(decl: FuncDecl) -> InlineConfig {
+	var default = InlineConfig(false, false, false);
+	if (decl.orig_bytecode.length > 50 || decl.sig.params.length > 10) return default;
+	var bi = BytecodeIterator.new().reset(decl);
+	var swap_instance = false;
+	var swap_membase = false;
+	while (bi.more()) {
+		var op = bi.current();
+		match (op) {
+			IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default;
+			THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP,
+			ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true;
+			I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32,
+			V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U,
+			I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => {
+				swap_membase = true;
+			}
+			_ => ;
+		}
+		bi.next();
+	}
+	return InlineConfig(swap_membase, swap_instance, true);
+}
+
+type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool);

From eb99fe0e939b47356a293e6869e38d5e67e05c19 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Wed, 27 Nov 2024 18:40:20 -0500
Subject: [PATCH 12/18] clean up dead comments

---
 src/engine/compiler/SinglePassCompiler.v3 | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 54d4db4c..684fc108 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -90,7 +90,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	def state = SpcState.new(regAlloc);
 	// Other state
 	def trap_labels = Vector<(TrapReason, MasmLabel)>.new();
-	//var start_pos = 0;
 	var module: Module;
 	var func: FuncDecl;
 	var sig: SigDecl;
@@ -302,9 +301,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		masm.emit_read_v3_mem_base(regs.mem0_base, regs.mem0_base);
 		masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base);
 	}
-	//if (func.func_index == 1) {
-	//emitTrap(TrapReason.UNREACHABLE);
-	//}
 }
 def visitLocalDecl(count: u32,
vtc: ValueTypeCode) {
 		var vt = vtc.toAbstractValueType(module);

From 8cb5fb818d3e00187c604b6f01d760797743d52f Mon Sep 17 00:00:00 2001
From: Tom An
Date: Tue, 10 Dec 2024 12:58:20 -0500
Subject: [PATCH 13/18] implement read/write probe intrinsification

---
 src/engine/Tuning.v3                      |  1 +
 src/engine/compiler/MacroAssembler.v3     |  9 ++-
 src/engine/compiler/SinglePassCompiler.v3 | 95 ++++++++++++++++++++++-
 src/engine/x86-64/X86_64MacroAssembler.v3 | 58 ++++++++++----
 src/monitors/R3Monitor.v3                 | 58 +++++++++++++-
 src/util/ProbeUtil.v3                     | 84 ++++++++++++++++++++
 6 files changed, 285 insertions(+), 20 deletions(-)

diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3
index 52d1f9f5..352ba246 100644
--- a/src/engine/Tuning.v3
+++ b/src/engine/Tuning.v3
@@ -54,4 +54,5 @@ component SpcTuning {
 	var inlineSmallFunc = true; // inline small functions, currently only applicable for whamm probes
 	def probeCallFreesRegs = true; // probe calls frees registers in abstract state
 	def runtimeCallFreesRegs = true; // runtime calls frees registers in abstract state
+	def intrinsifyMemoryProbes = true; // intrinsify memory read/write probes in SPC-compiled code
 }
diff --git a/src/engine/compiler/MacroAssembler.v3 b/src/engine/compiler/MacroAssembler.v3
index cf238b81..61e40436 100644
--- a/src/engine/compiler/MacroAssembler.v3
+++ b/src/engine/compiler/MacroAssembler.v3
@@ -165,6 +165,7 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) {
 	}

 	// Architecture-specific load and store routines for Wasm load/store.
+	// if `mirror_base` is a valid register, the store routines also mirror the stored value to that base.
 	def emit_loadbsx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32);
 	def emit_loadbzx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32);
 	def emit_loadwsx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32);
@@ -173,9 +174,9 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) {
 	def emit_loaddzx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32);
 	def emit_load_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32);

-	def emit_storeb_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32);
-	def emit_storew_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32);
-	def emit_store_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32);
+	def emit_storeb_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, mirror_base: Reg, index: Reg, offset: u32);
+	def emit_storew_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, mirror_base: Reg, index: Reg, offset: u32);
+	def emit_store_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, mirror_base: Reg, index: Reg, offset: u32);

 	def emit_mov_r_r(kind: ValueKind, reg: Reg, reg2: Reg);
 	def emit_mov_r_m(kind: ValueKind, reg: Reg, addr: MasmAddr);
@@ -246,6 +247,8 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) {
 	def emit_call_runtime_getFrameAccessorMetaRef();
 	def emit_increment_CountProbe(tmp: Reg, probe: CountProbe, increment: u64);
 	def emit_call_OperandProbe_i_v_fire(probe: OperandProbe_i_v, value_reg: Reg);
+	def emit_call_MemoryReadProbe_fire(probe: MemoryReadProbe);
+	def emit_call_MemoryWriteProbe_fire(probe: MemoryWriteProbe);

 	def emit_debugger_breakpoint();

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 684fc108..3f8498f2 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -112,6 +112,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
// tracks the last masm writer offset to generate instruction trace for each bytecode. var codegen_offset: u64 = 0; + var intrinsified_read_probe: MemoryReadProbe = null; + var intrinsified_write_probe: MemoryWriteProbe = null; + new() { masm.unimplemented = unsupported; masm.newTrapLabel = newTrapLabel; // trap labels are per-pc @@ -331,6 +334,14 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Check for intrinsified probes. match (probe) { // TODO: emit code for multiple intrinsified probes. null => ; + x: MemoryReadProbe => if (SpcTuning.intrinsifyMemoryProbes && x.size <= 8) { + intrinsified_read_probe = x; + return; + } + x: MemoryWriteProbe => if (SpcTuning.intrinsifyMemoryProbes && x.size <= 8) { + intrinsified_write_probe = x; + return; + } x: CountProbe => if (SpcTuning.intrinsifyCountProbe) { // TODO: check for subclass override var tmp = allocTmp(ValueKind.REF); masm.emit_increment_CountProbe(tmp, x, 1); @@ -1510,8 +1521,33 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var nflags = IN_REG | SpcConsts.kindToFlags(kind); if (kind == ValueKind.I32) nflags |= (iv.flags & TAG_STORED); // tag may already be stored for index state.push(nflags, dest, 0); + + if (intrinsified_read_probe != null) { + // spill everything + state.emitSaveAll(resolver, probeSpillMode); + // compute VSP for potential frame access + emit_compute_vsp(regs.vsp, state.sp); + emit_spill_vsp(regs.vsp); + masm.emit_store_curstack_vsp(regs.vsp); + + // load RT args (addr, val) and call RT + if (index_reg.index == 0) { // fixed addr + index_reg = allocTmp(ValueKind.I32); + masm.emit_mov_r_i(index_reg, i32.!(offset)); + } else { + masm.emit_addw_r_i(index_reg, i32.!(offset)); + } + var arg1 = masm.getV3ParamReg(ValueKind.REF, 1); + var arg2 = masm.getV3ParamReg(ValueKind.REF, 2); + masm.emit_mov_r_r(ValueKind.REF, arg1, index_reg); + masm.emit_mov_r_m(kind, arg2, masm.slotAddr(state.sp - 1)); + masm.emit_call_MemoryReadProbe_fire(intrinsified_read_probe); + emit_reload_regs(); + if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); + intrinsified_read_probe = null; + } } - def emitStore(kind: ValueKind, imm: MemArg, meth: (ValueKind, Reg, Reg, Reg, u32) -> ()) { + def emitStore(kind: ValueKind, imm: MemArg, meth: (ValueKind, Reg, Reg, Reg, Reg, u32) -> ()) { var base_reg = regs.mem0_base; if (imm.memory_index != 0) { // XXX: cache the base register for memories > 0 @@ -1534,7 +1570,62 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } else { index_reg = popReg().reg; } - meth(kind, sv.reg, base_reg, index_reg, u32.!(offset)); // TODO: memory64 + var mirror_base: Reg; + if (intrinsified_write_probe != null) { + if (intrinsified_write_probe.has_fire_probe) { + // temporarily push values back in order to preserve the register values across RT call + if (index_reg.index != 0) + state.push(KIND_I32 | IN_REG, index_reg, 0); + state.push((byte.view(sv.kind().tag) << 4) | IN_REG, sv.reg, 0); + + // spill everything + state.emitSaveAll(resolver, probeSpillMode); + // compute VSP for potential frame access + emit_compute_vsp(regs.vsp, state.sp); + emit_spill_vsp(regs.vsp); + masm.emit_store_curstack_vsp(regs.vsp); + + // load RT args (addr, val) and call RT + if (index_reg.index == 0) { // fixed addr + index_reg = allocTmp(ValueKind.I32); + masm.emit_mov_r_i(index_reg, i32.!(offset)); + } else { + masm.emit_addw_r_i(index_reg, i32.!(offset)); + } + var arg1 = masm.getV3ParamReg(ValueKind.REF, 1); + var arg2 = 
masm.getV3ParamReg(ValueKind.REF, 2); + masm.emit_mov_r_r(ValueKind.REF, arg1, index_reg); + masm.emit_mov_r_m(kind, arg2, masm.slotAddr(state.sp - 1)); + masm.emit_call_MemoryWriteProbe_fire(intrinsified_write_probe); + emit_reload_regs(); + if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); + // restore abstract state + if (index_reg.index != 0) pop(); + pop(); + } + + if (intrinsified_write_probe.writeMirror != null) { // setup write mirror base + var mirror_mem_addr = Pointer.atObject(intrinsified_write_probe.writeMirror) - Pointer.NULL; + mirror_base = allocTmp(ValueKind.REF); + masm.emit_mov_r_l(mirror_base, mirror_mem_addr); + masm.emit_read_v3_mem_base(mirror_base, mirror_base); + } + intrinsified_write_probe = null; + } + meth(kind, sv.reg, base_reg, mirror_base, index_reg, u32.!(offset)); // TODO: memory64 + } + + def emitIntrinsifiedMemoryReadProbe() { + // spill everything + state.emitSaveAll(resolver, probeSpillMode); + // compute VSP for potential frame access + emit_compute_vsp(regs.vsp, state.sp); + emit_spill_vsp(regs.vsp); + masm.emit_store_curstack_vsp(regs.vsp); + + + emit_reload_regs(); + if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); } //==================================================================== diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index eeaaa376..4591c61f 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -152,11 +152,11 @@ class X86_64MacroAssembler extends MacroAssembler { var b = G(base), t = handle_large_offset(index, offset); recordCurSourceLoc(); match (kind) { - I32 => asm.movd_r_m(G(dst), X86_64Addr.new(b, t.0, 1, t.1)); - REF, I64 => asm.movq_r_m(G(dst), X86_64Addr.new(b, t.0, 1, t.1)); - F32 => asm.movss_s_m(X(dst), X86_64Addr.new(b, t.0, 1, t.1)); - F64 => asm.movsd_s_m(X(dst), X86_64Addr.new(b, t.0, 1, t.1)); - V128 => asm.movdqu_s_m(X(dst), X86_64Addr.new(b, t.0, 1, t.1)); + I32 => asm.movd_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1)); + REF, I64 => asm.movq_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1)); + F32 => asm.movss_s_m(X(dst), X86_64Addr.new(G(base), t.0, 1, t.1)); + F64 => asm.movsd_s_m(X(dst), X86_64Addr.new(G(base), t.0, 1, t.1)); + V128 => asm.movdqu_s_m(X(dst), X86_64Addr.new(G(base), t.0, 1, t.1)); } } def emit_v128_load_lane_r_m(dst: Reg, src: X86_64Addr, asm_mov_r_m: (X86_64Gpr, X86_64Addr) -> T) { @@ -171,25 +171,41 @@ class X86_64MacroAssembler extends MacroAssembler { var t = handle_large_offset(index, offset); return X86_64Addr.new(G(base), t.0, 1, t.1); } - def emit_storeb_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32) { + def emit_storeb_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, mirror_base: Reg, index: Reg, offset: u32) { var t = handle_large_offset(index, offset); recordCurSourceLoc(); + if (mirror_base.index != 0) { + asm.q.movb_m_r(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), G(val)); + } asm.q.movb_m_r(X86_64Addr.new(G(base), t.0, 1, t.1), G(val)); } - def emit_storew_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32) { + def emit_storew_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, mirror_base: Reg, index: Reg, offset: u32) { var t = handle_large_offset(index, offset); recordCurSourceLoc(); + if (mirror_base.index != 0) { + asm.q.movw_m_r(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), G(val)); + } asm.q.movw_m_r(X86_64Addr.new(G(base), t.0, 1, t.1), G(val)); } - def emit_store_r_r_r_i(kind: ValueKind, val: Reg, base: 
Reg, index: Reg, offset: u32) { + def emit_store_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, mirror_base: Reg, index: Reg, offset: u32) { var b = G(base), t = handle_large_offset(index, offset); recordCurSourceLoc(); + var addr = X86_64Addr.new(G(base), t.0, 1, t.1); + if (mirror_base.index != 0) { + match (kind) { + I32 => asm.movd_m_r(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), G(val)); + REF, I64 => asm.movq_m_r(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), G(val)); + F32 => asm.movss_m_s(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), X(val)); + F64 => asm.movsd_m_s(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), X(val)); + V128 => asm.movdqu_m_s(X86_64Addr.new(G(mirror_base), t.0, 1, t.1), X(val)); + } + } match (kind) { - I32 => asm.movd_m_r(X86_64Addr.new(b, t.0, 1, t.1), G(val)); - REF, I64 => asm.movq_m_r(X86_64Addr.new(b, t.0, 1, t.1), G(val)); - F32 => asm.movss_m_s(X86_64Addr.new(b, t.0, 1, t.1), X(val)); - F64 => asm.movsd_m_s(X86_64Addr.new(b, t.0, 1, t.1), X(val)); - V128 => asm.movdqu_m_s(X86_64Addr.new(b, t.0, 1, t.1), X(val)); + I32 => asm.movd_m_r(X86_64Addr.new(G(base), t.0, 1, t.1), G(val)); + REF, I64 => asm.movq_m_r(X86_64Addr.new(G(base), t.0, 1, t.1), G(val)); + F32 => asm.movss_m_s(X86_64Addr.new(G(base), t.0, 1, t.1), X(val)); + F64 => asm.movsd_m_s(X86_64Addr.new(G(base), t.0, 1, t.1), X(val)); + V128 => asm.movdqu_m_s(X86_64Addr.new(G(base), t.0, 1, t.1), X(val)); } } @@ -723,6 +739,22 @@ class X86_64MacroAssembler extends MacroAssembler { asm.icall_r(scratch); recordRetSourceLoc(); } + def emit_call_MemoryReadProbe_fire(probe: MemoryReadProbe) { + var codePtr = CiRuntime.unpackClosure(probe.fire_probe).0; + var refOffset = asm.movq_r_p(Target.V3_PARAM_GPRS[0], Pointer.atObject(probe) - Pointer.NULL); + addEmbeddedRefOffset(refOffset); + asm.movq_r_l(scratch, codePtr - Pointer.NULL); // XXX: make direct call to runtime if within 2GB + asm.icall_r(scratch); + recordRetSourceLoc(); + } + def emit_call_MemoryWriteProbe_fire(probe: MemoryWriteProbe) { + var codePtr = CiRuntime.unpackClosure(probe.fire_probe).0; + var refOffset = asm.movq_r_p(Target.V3_PARAM_GPRS[0], Pointer.atObject(probe) - Pointer.NULL); + addEmbeddedRefOffset(refOffset); + asm.movq_r_l(scratch, codePtr - Pointer.NULL); // XXX: make direct call to runtime if within 2GB + asm.icall_r(scratch); + recordRetSourceLoc(); + } def emit_call_HostCallStub() { var ic = X86_64PreGenStubs.getInterpreterCode(); asm.movq_r_l(scratch, (ic.start + ic.header.hostCallStubOffset) - Pointer.NULL); diff --git a/src/monitors/R3Monitor.v3 b/src/monitors/R3Monitor.v3 index 2daa4ec0..96e4bd78 100644 --- a/src/monitors/R3Monitor.v3 +++ b/src/monitors/R3Monitor.v3 @@ -26,8 +26,6 @@ class R3Monitor extends Monitor { var mm = ModuleInstrumenter.new(module); R3MonitorBytecodeInstrumenter.new(handler, module).runMatching(filterReplayFunctions); mm.forEachFuncMatching(filterReplayFunctions, instrumentFunctionEnter); - mm.beforeMemReadMatching(filterReplayFunctions, handler.onMemoryRead); - mm.beforeMemWriteMatching(filterReplayFunctions, handler.onMemoryWrite); mm.beforeMemGrowMatching(filterReplayFunctions, handler.onMemoryGrow); } @@ -62,6 +60,14 @@ private class R3MonitorBytecodeInstrumenter extends BytecodeInstrumenter { handler = handler; } + def visitLoad(op: Opcode, imm: MemArg, size: u8) { + checkCallReturnAndInsertProbe(R3MemoryReadProbe.new(handler, imm, size)); + } + + def visitStore(op: Opcode, imm: MemArg, size: u8) { + checkCallReturnAndInsertProbe(R3MemoryWriteProbe.new(handler, imm, size)); + } + def visitOp(op: 
Opcode) { if (last_op_is_call) { insertProbeHere(ReturnProbe.new(handler)); @@ -613,6 +619,54 @@ private class MemoryFillProbe(handler: EventHandler, mem_index: u31) extends Pro } } +private class R3MemoryWriteProbe extends MemoryWriteProbe { + def var handler: EventHandler; + + new(handler: EventHandler, imm: MemArg, size: u8) { + this.imm = imm; + this.size = size; + this.handler = handler; + this.writeMirror = handler.shadowMems[imm.memory_index]; + this.has_fire_probe = false; + } +} + +private class R3MemoryReadProbe extends MemoryReadProbe { + def var handler: EventHandler; + + new(handler: EventHandler, imm: MemArg, size: u8) { + this.imm = imm; + this.size = size; + this.handler = handler; + } + + def fire_probe(addr: u64, val_lower: u64, val_upper: u64) { + var size = u32.view(size); + var shadow_mem = handler.shadowMems[imm.memory_index]; + var shadow_ptr = shadow_mem.range_ol_64(addr, size).result; + var shadow_data_lo: u64, shadow_data_hi: u64; + match (size) { + 1 => shadow_data_lo = DataReaders.read_range_u8(shadow_ptr); + 2 => shadow_data_lo = DataReaders.read_range_u16(shadow_ptr); + 4 => shadow_data_lo = DataReaders.read_range_u32(shadow_ptr); + 8 => shadow_data_lo = DataReaders.read_range_u64(shadow_ptr); + 16 => { + var shadow_result = DataReaders.read_range_u128(shadow_ptr); + shadow_data_lo = shadow_result.0; + shadow_data_hi = shadow_result.1; + } + _ => ; + } + var val_range = Array.new(16); + DataWriters.write_range_u64(val_range, val_lower); + DataWriters.write_range_u64(val_range[8 ... 16], val_upper); + if (shadow_data_lo != val_lower || (size == 16 && (shadow_data_hi != val_upper))) { + handler.trace.put(WasmEvent.Load(imm.memory_index, addr, Arrays.range(val_range, 0, int.view(size)))); + shadow_mem.copyIn(u32.view(addr), val_range, 0, size); + } + } +} + private type CallStackEntry(func_id: int, kind: CallKind); private enum CallKind {INT, EXT} diff --git a/src/util/ProbeUtil.v3 b/src/util/ProbeUtil.v3 index a9877f26..d9668f8e 100644 --- a/src/util/ProbeUtil.v3 +++ b/src/util/ProbeUtil.v3 @@ -103,3 +103,87 @@ class ExternalDebuggerBreakpointProbe extends Probe { return Resumption.Continue; // TODO: currently only has effect in SPC code } } + +// A probe that instruments memory read, intrinsified by SPC. +class MemoryReadProbe extends Probe { + var imm: MemArg; + var size: u8; + // fallback invocation when not intrinsified. + def fire(dynamicLoc: DynamicLoc) -> Resumption { + var accessor = dynamicLoc.frame.getFrameAccessor(); + var address: u64 = Values.unbox_u(accessor.getTopOfStack()); + address += imm.offset; + var memory = dynamicLoc.func.instance.memories[imm.memory_index]; + var size = u32.view(size); + var mem_ptr_trap = memory.range_ol_64(address, size); + if (mem_ptr_trap.trapped()) + return Resumption.Trap(mem_ptr_trap.reason, true); + var low: u64, high: u64; + var mem_ptr = mem_ptr_trap.result; + match (size) { + 1 => { + low = DataReaders.read_range_u8(mem_ptr); + } + 2 => { + low = DataReaders.read_range_u16(mem_ptr); + } + 4 => { + low = DataReaders.read_range_u32(mem_ptr); + } + 8 => { + low = DataReaders.read_range_u64(mem_ptr); + } + 16 => { + var data_result = DataReaders.read_range_u128(mem_ptr); + low = data_result.0; + high = data_result.1; + } + _ => ; + } + fire_probe(address, low, high); + return Resumption.Continue; + } + def fire_probe(addr: u64, val_lower: u64, val_upper: u64) {} +} + +// A probe that instruments memory write, intrinsified by SPC. 
+// if writeMirror is set, any probed writes to the main memory will also be performed on this mirror. +class MemoryWriteProbe extends Probe { + var imm: MemArg, size: u8, writeMirror: Memory, has_fire_probe: bool; + + // fallback invocation when not intrinsified. + def fire(dynamicLoc: DynamicLoc) -> Resumption { + var accessor = dynamicLoc.frame.getFrameAccessor(); + var address: u64 = Values.unbox_u(accessor.getOperand(-1)); + address += imm.offset; + var size = u32.view(size); + var newval_lower: u64, newval_upper: u64; + match (accessor.getTopOfStack()) { + I32(v) => newval_lower = v; + I64(v) => newval_lower = v; + F32(v) => newval_lower = v; + F64(v) => newval_lower = v; + V128(low, high) => { + newval_lower = low; + newval_upper = high; + } + _ => ; + } + // apply write mirroring + if (writeMirror != null) { + var mirror_ptr = writeMirror.range_ol_64(address, size); + if (mirror_ptr.ok()) { + match (size) { + 1 => DataWriters.write_range_u8(mirror_ptr.result, u8.view(newval_lower)); + 2 => DataWriters.write_range_u16(mirror_ptr.result, u16.view(newval_lower)); + 4 => DataWriters.write_range_u32(mirror_ptr.result, u32.view(newval_lower)); + 8 => DataWriters.write_range_u64(mirror_ptr.result, newval_lower); + 16 => DataWriters.write_range_u128(mirror_ptr.result, (newval_lower, newval_upper)); + } + } + } + fire_probe(address, newval_lower, newval_upper); + return Resumption.Continue; + } + def fire_probe(addr: u64, newval_lower: u64, newval_upper: u64) {} +} \ No newline at end of file From f1bda15a80abdef44960bf591a16a43af517131e Mon Sep 17 00:00:00 2001 From: Tom An Date: Tue, 10 Dec 2024 20:07:00 -0500 Subject: [PATCH 14/18] refactored cache-sim monitor for benchmark measurement --- src/engine/compiler/SinglePassCompiler.v3 | 73 +++--- src/monitors/CacheSimulationMonitor.v3 | 292 ++++++++++++++++++++++ 2 files changed, 320 insertions(+), 45 deletions(-) create mode 100644 src/monitors/CacheSimulationMonitor.v3 diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 3f8498f2..5df9336b 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -1525,10 +1525,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (intrinsified_read_probe != null) { // spill everything state.emitSaveAll(resolver, probeSpillMode); - // compute VSP for potential frame access - emit_compute_vsp(regs.vsp, state.sp); - emit_spill_vsp(regs.vsp); - masm.emit_store_curstack_vsp(regs.vsp); // load RT args (addr, val) and call RT if (index_reg.index == 0) { // fixed addr @@ -1540,7 +1536,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var arg1 = masm.getV3ParamReg(ValueKind.REF, 1); var arg2 = masm.getV3ParamReg(ValueKind.REF, 2); masm.emit_mov_r_r(ValueKind.REF, arg1, index_reg); - masm.emit_mov_r_m(kind, arg2, masm.slotAddr(state.sp - 1)); + masm.emit_mov_r_m(ValueKind.REF, arg2, masm.slotAddr(state.sp - 1)); masm.emit_call_MemoryReadProbe_fire(intrinsified_read_probe); emit_reload_regs(); if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); @@ -1571,48 +1567,35 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl index_reg = popReg().reg; } var mirror_base: Reg; - if (intrinsified_write_probe != null) { - if (intrinsified_write_probe.has_fire_probe) { - // temporarily push values back in order to preserve the register values across RT call - if (index_reg.index != 0) - state.push(KIND_I32 | IN_REG, 
index_reg, 0); - state.push((byte.view(sv.kind().tag) << 4) | IN_REG, sv.reg, 0); - - // spill everything - state.emitSaveAll(resolver, probeSpillMode); - // compute VSP for potential frame access - emit_compute_vsp(regs.vsp, state.sp); - emit_spill_vsp(regs.vsp); - masm.emit_store_curstack_vsp(regs.vsp); - - // load RT args (addr, val) and call RT - if (index_reg.index == 0) { // fixed addr - index_reg = allocTmp(ValueKind.I32); - masm.emit_mov_r_i(index_reg, i32.!(offset)); - } else { - masm.emit_addw_r_i(index_reg, i32.!(offset)); - } - var arg1 = masm.getV3ParamReg(ValueKind.REF, 1); - var arg2 = masm.getV3ParamReg(ValueKind.REF, 2); - masm.emit_mov_r_r(ValueKind.REF, arg1, index_reg); - masm.emit_mov_r_m(kind, arg2, masm.slotAddr(state.sp - 1)); - masm.emit_call_MemoryWriteProbe_fire(intrinsified_write_probe); - emit_reload_regs(); - if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); - // restore abstract state - if (index_reg.index != 0) pop(); - pop(); - } - - if (intrinsified_write_probe.writeMirror != null) { // setup write mirror base - var mirror_mem_addr = Pointer.atObject(intrinsified_write_probe.writeMirror) - Pointer.NULL; - mirror_base = allocTmp(ValueKind.REF); - masm.emit_mov_r_l(mirror_base, mirror_mem_addr); - masm.emit_read_v3_mem_base(mirror_base, mirror_base); - } - intrinsified_write_probe = null; + if (intrinsified_write_probe != null && intrinsified_write_probe.writeMirror != null) { // setup write mirror base + var mirror_mem_addr = Pointer.atObject(intrinsified_write_probe.writeMirror) - Pointer.NULL; + mirror_base = allocTmp(ValueKind.REF); + masm.emit_mov_r_l(mirror_base, mirror_mem_addr); + masm.emit_read_v3_mem_base(mirror_base, mirror_base); } meth(kind, sv.reg, base_reg, mirror_base, index_reg, u32.!(offset)); // TODO: memory64 + if (intrinsified_write_probe != null && intrinsified_write_probe.has_fire_probe) { + // temporarily push values back in order to preserve the register values across RT call + state.push((byte.view(sv.kind().tag) << 4) | IN_REG, sv.reg, 0); + // spill everything + state.emitSaveAll(resolver, probeSpillMode); + // load RT args (addr, val) and call RT + if (index_reg.index == 0) { // fixed addr + index_reg = allocTmp(ValueKind.I32); + masm.emit_mov_r_i(index_reg, i32.!(offset)); + } else { + masm.emit_addw_r_i(index_reg, i32.!(offset)); + } + var arg1 = masm.getV3ParamReg(ValueKind.REF, 1); + var arg2 = masm.getV3ParamReg(ValueKind.REF, 2); + masm.emit_mov_r_r(ValueKind.REF, arg1, index_reg); + masm.emit_mov_r_m(ValueKind.REF, arg2, masm.slotAddr(state.sp - 1)); + masm.emit_call_MemoryWriteProbe_fire(intrinsified_write_probe); + emit_reload_regs(); + if (!probeSpillMode.free_regs) state.emitRestoreAll(resolver); + pop(); + } + intrinsified_write_probe = null; } def emitIntrinsifiedMemoryReadProbe() { diff --git a/src/monitors/CacheSimulationMonitor.v3 b/src/monitors/CacheSimulationMonitor.v3 new file mode 100644 index 00000000..e8abf916 --- /dev/null +++ b/src/monitors/CacheSimulationMonitor.v3 @@ -0,0 +1,292 @@ +// Copyright 2024 Wizard Authors. All rights reserved. +// See LICENSE for details of Apache 2.0 license. 
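+//
+// The simulated cache (see CacheInstance below) is 4-way set-associative with
+// 2^7-byte blocks and 2^11 sets; 7 offset bits + 11 index bits + 14 tag bits
+// cover a 32-bit address space, for a capacity of 4 * 128 B * 2048 = 1 MiB.
+// An access that straddles a block boundary is counted once per block touched.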
+
+def monitor_ = MonitorRegistry.add(
+	"cache-sim", "Simulates a cache for a simple memory model.",
+	CacheSimulationMonitor.new());
+
+
+class CacheSimulationMonitor extends Monitor {
+	def cache = CacheInstance.new(4, 7, 11, 14);
+	def block_size_bits: u7 = 7;
+	def bi = BytecodeIterator.new();
+	var cache_stats: Array<HashMap<int, CacheStat>>;
+	var mod: Module;
+
+	def onParse(module: Module, err: ErrorGen) {
+		mod = module;
+		cache_stats = Array<HashMap<int, CacheStat>>.new(module.functions.length);
+		CacheSimBytecodeInstrumenter.new(beforeMemAccess, module).run();
+	}
+
+	def onFinish(i: Instance, r: Result) {
+		// disabled printing for performance analysis
+		//for (i < cache_stats.length) {
+		//	if (cache_stats[i] != null) {
+		//		var decl = mod.functions[i];
+		//		output_func_header(mod, decl);
+		//		var unordered_stats = Vector<(int, CacheStat)>.new();
+		//		cache_stats[i].apply(processStats(_, _, unordered_stats));
+		//		var len = unordered_stats.length;
+		//		var ordered_stats = Arrays.sort(unordered_stats.extract(), 0, len, orderStats);
+		//		for (stat in ordered_stats) {
+		//			var pc = stat.0;
+		//			var name = bi.reset(decl).at(pc).current().mnemonic;
+		//			output_cache_stat(stat.0, name, stat.1);
+		//		}
+		//	}
+		//}
+	}
+
+	private def processStats(pc: int, stats: CacheStat, accum: Vector<(int, CacheStat)>) {
+		accum.put((pc, stats));
+	}
+	private def orderStats(a: (int, CacheStat), b: (int, CacheStat)) -> bool {
+		return a.0 < b.0;
+	}
+
+	private def beforeMemAccess(pc: int, func_id: int, address: u64, size: u64) {
+		if (cache_stats[func_id] == null) {
+			cache_stats[func_id] = HashMap<int, CacheStat>.new(int.!, int.==);
+		}
+		var old_stats = cache_stats[func_id][pc];
+		var hits = old_stats.hits;
+		var misses = old_stats.misses;
+		var addr_start_index = address >> block_size_bits;
+		var addr_end_index = (address + size - 1) >> block_size_bits;
+		if (LRUResult.Hit.?(cache.access(u32.view(addr_start_index << block_size_bits)))) {
+			hits += 1;
+		} else {
+			misses += 1;
+		}
+		if (addr_start_index != addr_end_index) {
+			if (LRUResult.Hit.?(cache.access(u32.view(addr_end_index << block_size_bits)))) {
+				hits += 1;
+			} else {
+				misses += 1;
+			}
+		}
+		cache_stats[func_id][pc] = CacheStat(hits, misses);
+	}
+}
+
+private class CacheSimBytecodeInstrumenter extends BytecodeInstrumenter {
+	def var callback: (int, int, u64, u64) -> ();
+
+	new(callback: (int, int, u64, u64) -> (), module: Module) super(module) {
+		this.callback = callback;
+	}
+
+	def visitLoad(op: Opcode, imm: MemArg, size: u8) {
+		var bi = this.bi;
+		insertProbeHere(CacheReadProbe.new(callback(bi.pc, bi.func.func_index, _, u64.view(size)), imm, size));
+	}
+	def visitStore(op: Opcode, imm: MemArg, size: u8) {
+		var bi = this.bi;
+		insertProbeHere(CacheWriteProbe.new(callback(bi.pc, bi.func.func_index, _, u64.view(size)), imm, size));
+	}
+}
+
+private class CacheWriteProbe extends MemoryWriteProbe {
+	def var callback: (u64) -> ();
+
+	new(callback: (u64) -> (), imm: MemArg, size: u8) {
+		this.imm = imm;
+		this.size = size;
+		this.callback = callback;
+		this.has_fire_probe = true;
+	}
+
+	def fire_probe(addr: u64, val: u64, unused_val: u64) {
+		this.callback(addr);
+	}
+}
+
+private class CacheReadProbe extends MemoryReadProbe {
+	def var callback: (u64) -> ();
+
+	new(callback: (u64) -> (), imm: MemArg, size: u8) {
+		this.imm = imm;
+		this.size = size;
+		this.callback = callback;
+	}
+
+	def fire_probe(addr: u64, val: u64, unused_val: u64) {
+		this.callback(addr);
+	}
+}
+
+/***** OUTPUT FORMATTING HELPER METHODS (Do not modify) *****/
+/*
+ * ***Output Format for Grading:***
+ *
+ * 1. **Function Header Output:**
+ *    - Use the helper method `output_func_header()` to print each function's header.
+ *    - The **function headers must be printed in increasing order of function index**
+ *      (e.g., Func[0], Func[1], Func[2], ...).
+ *    - **Ensure** that the function header is printed **before** the corresponding cache statistics.
+ *
+ * 2. **CacheStat Output:**
+ *    - After printing each function header, use the helper method `output_cache_stat()`
+ *      to print the cache statistics.
+ *    - The **cache statistics must be printed in increasing order of the program counter (pc)**
+ *      for each function.
+ *
+ * ***Important Notes:***
+ * - The **correct order of output** is essential for grading.
+ * - Output **must follow** the specified structure of increasing function index and program counter.
+ * - Do not report unvisited functions or accesses.
+ * - Do not invoke any output functions other than the methods specified above.
+ * - **Double-check your output** to ensure it matches the expected format, as incorrect output will result in lost points.
+ */
+
+type CacheStat(hits: u64, misses: u64);
+
+// Method to output function name
+def output_func_header(module: Module, func: FuncDecl) {
+	Trace.OUT.beginColor(Color.FUNC)
+		.put1("func %q:", func.render(module.names, _))
+		.endColors().ln();
+
+}
+// Method to output a single CacheStat
+def output_cache_stat(pc: int, op_mnemonic: string, stat: CacheStat) {
+	Trace.OUT.beginColor(Color.INACTIVE)
+		.mark()
+		.put1("+%d", pc)
+		.rjustify_mark(5)
+		.endColors()
+		.putc(' ');
+
+	Trace.OUT.beginColor(Color.SOURCE)
+		.mark()
+		.puts(op_mnemonic)
+		.rjustify_mark(16)
+		.putc(':')
+		.putc(' ')
+		.endColors();
+
+
+	Trace.OUT.beginColor(Color.SUCCESS)
+		.mark()
+		.put1("%d", stat.hits)
+		.rjustify_mark(6)
+		.endColors()
+		.putc(' ');
+	Trace.OUT.beginColor(Color.HIGH)
+		.mark()
+		.put1("%d", stat.misses)
+		.rjustify_mark(6)
+		.endColors()
+		.putc(' ')
+		.ln();
+}
+/******************/
+
+
+type TagElement(valid: bool, tag: u14, age: u64);
+
+type LRUResult {
+	case Hit(t: u14);
+	case Miss(t: u14, is_evict: bool, evict: u14);
+}
+
+
+class TagStoreEntry(assoc: int) {
+	def elems = Array<TagElement>.new(assoc);
+
+	def is_elem_valid(t: TagElement) -> bool {
+		return t.valid;
+	}
+
+	def access_update(tag: u14) -> LRUResult {
+		var invalid_idx = -1;
+		var hit_idx = -1;
+		var rep_idx = -1;
+		var is_hit = false;
+		var has_invalid = false;
+		var max_age: u64 = 0;
+
+		for (i < assoc) {
+			if (elems[i].valid) {
+				if (elems[i].tag == tag) {
+					// Check for Hit; reset age
+					is_hit = true;
+					hit_idx = i;
+				}
+				else {
+					if (!has_invalid && (elems[i].age >= max_age)) {
+						// Track oldest element for eviction as long as
+						// there is no invalid element to replace
+						max_age = elems[i].age;
+						rep_idx = i;
+					}
+				}
+				// Increment age of all valid elements
+				elems[i] = TagElement(elems[i].valid, elems[i].tag, elems[i].age + 1);
+			} else {
+				// There exists an invalid element to fill for miss
+				has_invalid = true;
+				invalid_idx = i;
+			}
+		}
+
+		if (is_hit) {
+			elems[hit_idx] = TagElement(true, tag, 0);
+			return LRUResult.Hit(tag);
+		}
+
+		if (has_invalid) {
+			elems[invalid_idx] = TagElement(true, tag, 0);
+			return LRUResult.Miss(tag, false, 0);
+		}
+
+		// All elements are valid and no hit; evict with LRU
+		var evict_tag = elems[rep_idx].tag;
+		elems[rep_idx] = TagElement(true, tag, 0);
+		return LRUResult.Miss(tag, true, evict_tag);
+	}
+}
+
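+// For intuition, a 2-way TagStoreEntry behaves as follows (a hand-traced
+// sketch of access_update, not part of the monitor itself):
+//   var e = TagStoreEntry.new(2);
+//   e.access_update(1);  // Miss(1, false, 0): fills an invalid way
+//   e.access_update(2);  // Miss(2, false, 0): fills the other way
+//   e.access_update(1);  // Hit(1): resets the age of tag 1
+//   e.access_update(3);  // Miss(3, true, 2): all ways valid, evicts LRU tag 2
+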
+class CacheInstance {
+	var tag_store: Array<TagStoreEntry>;
+
+	var assoc: int;
+	var tag_bits: byte;
+	var block_bits: byte;
+	var index_bits: byte;
+
+	var block_size: int;
+	var index_size: int;
+
+	new (assoc: int, block_bits: byte, index_bits: byte, tag_bits: byte) {
+		this.tag_bits = tag_bits;
+		this.block_bits = block_bits;
+		this.index_bits = index_bits;
+
+		if ((tag_bits + index_bits + block_bits) != 32) {
+			System.puts("Cache parameters do not add up to 32 bits.\n");
+		}
+
+		this.assoc = assoc;
+		this.index_size = (1 << index_bits);
+		this.block_size = (1 << block_bits);
+
+		tag_store = Array<TagStoreEntry>.new(this.index_size);
+		for (i < this.index_size) {
+			tag_store[i] = TagStoreEntry.new(assoc);
+		}
+	}
+
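+	// For reference, with block_bits = 7, index_bits = 11, tag_bits = 14,
+	// an address decomposes as follows (a worked example):
+	//   addr   = 0x00012345
+	//   offset = addr & 0x7f           = 0x45
+	//   index  = (addr >> 7) & 0x7ff   = 0x246
+	//   tag    = (addr >> 18) & 0x3fff = 0x0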
+	def access(addr: u32) -> LRUResult {
+		var offset = u7.!(addr & ((1u << this.block_bits) - 1));
+		addr >>= this.block_bits;
+		var index = u11.!(addr & ((1u << this.index_bits) - 1));
+		addr >>= this.index_bits;
+		var tag_val = u14.!(addr & ((1u << this.tag_bits) - 1));
+
+		var lru_result = tag_store[index].access_update(tag_val);
+		return lru_result;
+	}
+
+}
\ No newline at end of file
From 5bbeafdd02da0413a008ffff285bdc59d412a0f2 Mon Sep 17 00:00:00 2001
From: Tom An
Date: Tue, 10 Dec 2024 23:56:48 -0500
Subject: [PATCH 15/18] update example monitor for evaluation

---
 src/monitors/CacheSimulationMonitor.v3 | 292 ------------------------
 src/monitors/MemAccessMonitor.v3       |  71 ++++++
 2 files changed, 71 insertions(+), 292 deletions(-)
 delete mode 100644 src/monitors/CacheSimulationMonitor.v3
 create mode 100644 src/monitors/MemAccessMonitor.v3

diff --git a/src/monitors/CacheSimulationMonitor.v3 b/src/monitors/CacheSimulationMonitor.v3
deleted file mode 100644
index e8abf916..00000000
--- a/src/monitors/CacheSimulationMonitor.v3
+++ /dev/null
@@ -1,292 +0,0 @@
-// Copyright 2024 Wizard Authors. All rights reserved.
-// See LICENSE for details of Apache 2.0 license.
-
-def monitor_ = MonitorRegistry.add(
-	"cache-sim", "Simulates a cache for a simple memory model.",
-	CacheSimulationMonitor.new());
-
-
-class CacheSimulationMonitor extends Monitor {
-	def cache = CacheInstance.new(4, 7, 11, 14);
-	def block_size_bits: u7 = 7;
-	def bi = BytecodeIterator.new();
-	var cache_stats: Array<HashMap<int, CacheStat>>;
-	var mod: Module;
-
-	def onParse(module: Module, err: ErrorGen) {
-		mod = module;
-		cache_stats = Array<HashMap<int, CacheStat>>.new(module.functions.length);
-		CacheSimBytecodeInstrumenter.new(beforeMemAccess, module).run();
-	}
-
-	def onFinish(i: Instance, r: Result) {
-		// disabled printing for performance analysis
-		//for (i < cache_stats.length) {
-		//	if (cache_stats[i] != null) {
-		//		var decl = mod.functions[i];
-		//		output_func_header(mod, decl);
-		//		var unordered_stats = Vector<(int, CacheStat)>.new();
-		//		cache_stats[i].apply(processStats(_, _, unordered_stats));
-		//		var len = unordered_stats.length;
-		//		var ordered_stats = Arrays.sort(unordered_stats.extract(), 0, len, orderStats);
-		//		for (stat in ordered_stats) {
-		//			var pc = stat.0;
-		//			var name = bi.reset(decl).at(pc).current().mnemonic;
-		//			output_cache_stat(stat.0, name, stat.1);
-		//		}
-		//	}
-		//}
-	}
-
-	private def processStats(pc: int, stats: CacheStat, accum: Vector<(int, CacheStat)>) {
-		accum.put((pc, stats));
-	}
-	private def orderStats(a: (int, CacheStat), b: (int, CacheStat)) -> bool {
-		return a.0 < b.0;
-	}
-
-	private def beforeMemAccess(pc: int, func_id: int, address: u64, size: u64) {
-		if (cache_stats[func_id] == null) {
-			cache_stats[func_id] = HashMap<int, CacheStat>.new(int.!, int.==);
-		}
-		var old_stats = cache_stats[func_id][pc];
-		var hits = old_stats.hits;
-		var misses = old_stats.misses;
-		var addr_start_index = address >> block_size_bits;
-		var addr_end_index = (address + size - 1) >> block_size_bits;
-		if (LRUResult.Hit.?(cache.access(u32.view(addr_start_index << block_size_bits)))) {
-			hits += 1;
-		} else {
-			misses += 1;
-		}
-		if (addr_start_index != addr_end_index) {
-			if (LRUResult.Hit.?(cache.access(u32.view(addr_end_index << block_size_bits)))) {
-				hits += 1;
-			} else {
-				misses += 1;
-			}
-		}
-		cache_stats[func_id][pc] = CacheStat(hits, misses);
-	}
-}
-
-private class CacheSimBytecodeInstrumenter extends BytecodeInstrumenter {
-	def var callback: (int, int, u64, u64) -> ();
-
-	new(callback: (int, int, u64, u64) -> (), module: Module) super(module) {
-		this.callback = callback;
-	}
-
-	def visitLoad(op: Opcode, imm: MemArg, size: u8) {
-		var bi = this.bi;
-		insertProbeHere(CacheReadProbe.new(callback(bi.pc, bi.func.func_index, _, u64.view(size)), imm, size));
-	}
-	def visitStore(op: Opcode, imm: MemArg, size: u8) {
-		var bi = this.bi;
-		insertProbeHere(CacheWriteProbe.new(callback(bi.pc, bi.func.func_index, _, u64.view(size)), imm, size));
-	}
-}
-
-private class CacheWriteProbe extends MemoryWriteProbe {
-	def var callback: (u64) -> ();
-
-	new(callback: (u64) -> (), imm: MemArg, size: u8) {
-		this.imm = imm;
-		this.size = size;
-		this.callback = callback;
-		this.has_fire_probe = true;
-	}
-
-	def fire_probe(addr: u64, val: u64, unused_val: u64) {
-		this.callback(addr);
-	}
-}
-
-private class CacheReadProbe extends MemoryReadProbe {
-	def var callback: (u64) -> ();
-
-	new(callback: (u64) -> (), imm: MemArg, size: u8) {
-		this.imm = imm;
-		this.size = size;
-		this.callback = callback;
-	}
-
-	def fire_probe(addr: u64, val: u64, unused_val: u64) {
-		this.callback(addr);
-	}
-}
-
-/***** OUTPUT FORMATTING HELPER METHODS (Do not modify) *****/
-/*
- * ***Output Format for Grading:***
- *
- * 1. **Function Header Output:**
- *    - Use the helper method `output_func_header()` to print each function's header.
- *    - The **function headers must be printed in increasing order of function index**
- *      (e.g., Func[0], Func[1], Func[2], ...).
- *    - **Ensure** that the function header is printed **before** the corresponding cache statistics.
- *
- * 2. **CacheStat Output:**
- *    - After printing each function header, use the helper method `output_cache_stat()`
- *      to print the cache statistics.
- *    - The **cache statistics must be printed in increasing order of the program counter (pc)**
- *      for each function.
- *
- * ***Important Notes:***
- * - The **correct order of output** is essential for grading.
- * - Output **must follow** the specified structure of increasing function index and program counter.
- * - Do not report unvisited functions or accesses.
- * - Do not invoke any output functions other than the methods specified above.
- * - **Double-check your output** to ensure it matches the expected format, as incorrect output will result in lost points.
- */
-
-type CacheStat(hits: u64, misses: u64);
-
-// Method to output function name
-def output_func_header(module: Module, func: FuncDecl) {
-	Trace.OUT.beginColor(Color.FUNC)
-		.put1("func %q:", func.render(module.names, _))
-		.endColors().ln();
-
-}
-// Method to output a single CacheStat
-def output_cache_stat(pc: int, op_mnemonic: string, stat: CacheStat) {
-	Trace.OUT.beginColor(Color.INACTIVE)
-		.mark()
-		.put1("+%d", pc)
-		.rjustify_mark(5)
-		.endColors()
-		.putc(' ');
-
-	Trace.OUT.beginColor(Color.SOURCE)
-		.mark()
-		.puts(op_mnemonic)
-		.rjustify_mark(16)
-		.putc(':')
-		.putc(' ')
-		.endColors();
-
-
-	Trace.OUT.beginColor(Color.SUCCESS)
-		.mark()
-		.put1("%d", stat.hits)
-		.rjustify_mark(6)
-		.endColors()
-		.putc(' ');
-	Trace.OUT.beginColor(Color.HIGH)
-		.mark()
-		.put1("%d", stat.misses)
-		.rjustify_mark(6)
-		.endColors()
-		.putc(' ')
-		.ln();
-}
-/******************/
-
-
-type TagElement(valid: bool, tag: u14, age: u64);
-
-type LRUResult {
-	case Hit(t: u14);
-	case Miss(t: u14, is_evict: bool, evict: u14);
-}
-
-
-class TagStoreEntry(assoc: int) {
-	def elems = Array<TagElement>.new(assoc);
-
-	def is_elem_valid(t: TagElement) -> bool {
-		return t.valid;
-	}
-
-	def access_update(tag: u14) -> LRUResult {
-		var invalid_idx = -1;
-		var hit_idx = -1;
-		var rep_idx = -1;
-		var is_hit = false;
-		var has_invalid = false;
-		var max_age: u64 = 0;
-
-		for (i < assoc) {
-			if (elems[i].valid) {
-				if (elems[i].tag == tag) {
-					// Check for Hit; reset age
-					is_hit = true;
-					hit_idx = i;
-				}
-				else {
-					if (!has_invalid && (elems[i].age >= max_age)) {
-						// Track oldest element for eviction as long as
-						// there is no invalid element to replace
-						max_age = elems[i].age;
-						rep_idx = i;
-					}
-				}
-				// Increment age of all valid elements
-				elems[i] = TagElement(elems[i].valid, elems[i].tag, elems[i].age + 1);
-			} else {
-				// There exists an invalid element to fill for miss
-				has_invalid = true;
-				invalid_idx = i;
-			}
-		}
-
-		if (is_hit) {
-			elems[hit_idx] = TagElement(true, tag, 0);
-			return LRUResult.Hit(tag);
-		}
-
-		if (has_invalid) {
-			elems[invalid_idx] = TagElement(true, tag, 0);
-			return LRUResult.Miss(tag, false, 0);
-		}
-
-		// All elements are valid and no hit; evict with LRU
-		var evict_tag = elems[rep_idx].tag;
-		elems[rep_idx] = TagElement(true, tag, 0);
-		return LRUResult.Miss(tag, true, evict_tag);
-	}
-}
-
-class CacheInstance {
-	var tag_store: Array<TagStoreEntry>;
-
-	var assoc: int;
-	var tag_bits: byte;
-	var block_bits: byte;
-	var index_bits: byte;
-
-	var block_size: int;
-	var index_size: int;
-
-	new (assoc: int, block_bits: byte, index_bits: byte, tag_bits: byte) {
-		this.tag_bits = tag_bits;
-		this.block_bits = block_bits;
-		this.index_bits = index_bits;
-
-		if ((tag_bits + index_bits + block_bits) != 32) {
-			System.puts("Cache parameters do not add up to 32 bits.\n");
-		}
-
-		this.assoc = assoc;
-		this.index_size = (1 << index_bits);
-		this.block_size = (1 << block_bits);
-
-		tag_store = Array<TagStoreEntry>.new(this.index_size);
-		for (i < this.index_size) {
-			tag_store[i] = TagStoreEntry.new(assoc);
-		}
-	}
-
-	def access(addr: u32) -> LRUResult {
-		var offset = u7.!(addr & ((1u << this.block_bits) - 1));
-		addr >>= this.block_bits;
-		var index = u11.!(addr & ((1u << this.index_bits) - 1));
-		addr >>= this.index_bits;
-		var tag_val = u14.!(addr & ((1u << this.tag_bits) - 1));
-
-		var lru_result = tag_store[index].access_update(tag_val);
-		return lru_result;
-	}
-
-}
\ No newline at end of file
diff --git a/src/monitors/MemAccessMonitor.v3 b/src/monitors/MemAccessMonitor.v3
new file mode 100644
index 00000000..a2b38a02
--- /dev/null
+++ b/src/monitors/MemAccessMonitor.v3
@@ -0,0 +1,71 @@
+// Copyright 2024 Wizard Authors. All rights reserved.
+// See LICENSE for details of Apache 2.0 license.
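+//
+// This example monitor keeps a coarse histogram of memory traffic: each probed
+// load or store increments a counter for the 10000-byte block containing its
+// address. Only plain i32/i64 loads and stores are instrumented.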
+
+def monitor_ = MonitorRegistry.add(
+	"mem-access", "tracks memory access pattern.",
+	MemAccessMonitor.new());
+
+
+class MemAccessMonitor extends Monitor {
+	def bi = BytecodeIterator.new();
+	var block_access: HashMap<int, int>;
+	var mod: Module;
+
+	def onParse(module: Module, err: ErrorGen) {
+		mod = module;
+		block_access = HashMap<int, int>.new(int.!, int.==);
+		MemAccessBytecodeInstrumenter.new(block_access, module).run();
+	}
+}
+
+private class MemAccessBytecodeInstrumenter extends BytecodeInstrumenter {
+	def var block_access: HashMap<int, int>;
+
+	new(block_access: HashMap<int, int>, module: Module) super(module) {
+		this.block_access = block_access;
+	}
+
+	def visitLoad(op: Opcode, imm: MemArg, size: u8) {
+		if (op == Opcode.I32_LOAD || op == Opcode.I64_LOAD) {
+			var bi = this.bi;
+			insertProbeHere(AccessReadProbe.new(this.block_access, imm, size));
+		}
+	}
+	def visitStore(op: Opcode, imm: MemArg, size: u8) {
+		if (op == Opcode.I32_STORE || op == Opcode.I64_STORE) {
+			var bi = this.bi;
+			insertProbeHere(AccessWriteProbe.new(this.block_access, imm, size));
+		}
+	}
+}
+
+private class AccessWriteProbe extends MemoryWriteProbe {
+	def var block_access: HashMap<int, int>;
+
+	new(block_access: HashMap<int, int>, imm: MemArg, size: u8) {
+		this.imm = imm;
+		this.size = size;
+		this.block_access = block_access;
+		this.has_fire_probe = true;
+	}
+
+	def fire_probe(addr: u64, val: u64, unused_val: u64) {
+		var block = int.!(addr / 10000);
+		block_access[block]++;
+	}
+}
+
+private class AccessReadProbe extends MemoryReadProbe {
+	def var block_access: HashMap<int, int>;
+
+	new(block_access: HashMap<int, int>, imm: MemArg, size: u8) {
+		this.imm = imm;
+		this.size = size;
+		this.block_access = block_access;
+	}
+
+	def fire_probe(addr: u64, val: u64, unused_val: u64) {
+		var block = int.!(addr / 10000);
+		block_access[block]++;
+	}
+}
From 2a9999ef4bb621a54ba046485b835f4e39ca8b2c Mon Sep 17 00:00:00 2001
From: Tom An
Date: Wed, 11 Dec 2024 21:08:11 -0500
Subject: [PATCH 16/18] major bugfix

---
 src/engine/Engine.v3                          |  2 +-
 src/engine/Value.v3                           | 28 +++++++++++++++++++
 .../x86-64/X86_64WhammProbeTrampoline.v3      | 11 +++++++-
 src/util/Whamm.v3                             | 19 +++++++++++++
 4 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/src/engine/Engine.v3 b/src/engine/Engine.v3
index 0f643d25..9248b59c 100644
--- a/src/engine/Engine.v3
+++ b/src/engine/Engine.v3
@@ -25,7 +25,7 @@ class Engine {
 		if (data == null) return FileLoadResult.FileNotFound(path);
 		var limits = Limits.new().set(extensions);
 		var bp = BinParser.new(extensions, limits, path);
-		bp.tiering = tiering_override;
+		bp.tiering = if(tiering_override != null, tiering_override, Execute.tiering);
 		var r = bp.push(data, 0, data.length).finish();
 		match (r) {
 			Ok(module) =>
diff --git a/src/engine/Value.v3 b/src/engine/Value.v3
index db36823b..88bf85b5 100644
--- a/src/engine/Value.v3
+++ b/src/engine/Value.v3
@@ -10,6 +10,34 @@ type Value {
 	case F32(bits: u32);
 	case F64(bits: u64);
 	case V128(low: u64, high: u64);
+
+	def equal(that: Value) -> bool {
+		if (this == that) return true;
+		if (Value.Ref.?(this) && Value.Ref.?(that)) {
+			return Value.Ref.!(this).val == Value.Ref.!(that).val;
+		}
+		if (Value.I31.?(this) && Value.I31.?(that)) {
+			return Value.I31.!(this).val == Value.I31.!(that).val;
+		}
+		if (Value.I32.?(this) && Value.I32.?(that)) {
+			return Value.I32.!(this).val == Value.I32.!(that).val;
+		}
+		if (Value.I64.?(this) && Value.I64.?(that)) {
+			return Value.I64.!(this).val == Value.I64.!(that).val;
+		}
+		if (Value.F32.?(this) && Value.F32.?(that)) {
+			return Value.F32.!(this).bits == Value.F32.!(that).bits;
+		}
+		if (Value.F64.?(this) && Value.F64.?(that)) {
+			return Value.F64.!(this).bits == Value.F64.!(that).bits;
+		}
+		if (Value.V128.?(this) && Value.V128.?(that)) {
+			var a = Value.V128.!(this);
+			var b = Value.V128.!(that);
+			return a.low == b.low && a.high == b.high;
+		}
+		return false;
+	}
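+
+	// Structural comparison examples (Ref values still compare by reference
+	// identity; all other kinds compare by their payload):
+	//   Value.V128(1, 2).equal(Value.V128(1, 2))  // true
+	//   Value.I32(1).equal(Value.I64(1))          // false: kinds differ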
 }
 
 // Categorization of values into storage kinds.
diff --git a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3 b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
index e98c372a..e921cae3 100644
--- a/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
+++ b/src/engine/x86-64/X86_64WhammProbeTrampoline.v3
@@ -79,11 +79,20 @@ component X86_64WhammTrampoline {
 
 	def getEntry(sig: Array<WhammArg>) -> Pointer {
 		for (entry = trampoline_entries; entry != null; entry = entry.tail) {
-			if (Arrays.equal(entry.head.0, sig)) return entry.head.1;
+			if (sigArraysMatch(entry.head.0, sig)) {
+				return entry.head.1;
+			}
 		}
 		return Pointer.NULL;
 	}
 
+def sigArraysMatch(x: Array<WhammArg>, y: Array<WhammArg>) -> bool {
+	if (x == y) return true;
+	if (x.length != y.length) return false;
+	for (i < x.length) if (!x[i].equal(y[i])) return false;
+	return true;
+}
+
 def allocateCodeForTrampoline() {
 	// Allocate 10 pages to ensure that we have enough space for all trampoline code.
 	var code_size = PAGE_SIZE_i * 10;
diff --git a/src/util/Whamm.v3 b/src/util/Whamm.v3
index 17d17079..9600a8f3 100644
--- a/src/util/Whamm.v3
+++ b/src/util/Whamm.v3
@@ -41,6 +41,25 @@ type WhammArg {
 	case Val(v: Value);
 	case Operand(t: ValueType, i: int);
 	case Local(t: ValueType, i: int);
+
+	def equal(that: WhammArg) -> bool {
+		if (this == that) return true;
+		if (WhammArg.FrameAccessor.?(this) && WhammArg.FrameAccessor.?(that)) return true;
+		if (WhammArg.Val.?(this) && WhammArg.Val.?(that)) {
+			return WhammArg.Val.!(this).v.equal(WhammArg.Val.!(that).v);
+		}
+		if (WhammArg.Operand.?(this) && WhammArg.Operand.?(that)) {
+			var a = WhammArg.Operand.!(this);
+			var b = WhammArg.Operand.!(that);
+			return ValueTypes.kind(a.t).code == ValueTypes.kind(b.t).code && a.i == b.i;
+		}
+		if (WhammArg.Local.?(this) && WhammArg.Local.?(that)) {
+			var a = WhammArg.Local.!(this);
+			var b = WhammArg.Local.!(that);
+			return ValueTypes.kind(a.t).code == ValueTypes.kind(b.t).code && a.i == b.i;
+		}
+		return false;
+	}
 }
 
 class WhammPredicate(call: WhammParam.Call) { }
From 5974a34f2a11a0a546ecf6044a25383316ebb40f Mon Sep 17 00:00:00 2001
From: Tom An
Date: Thu, 12 Dec 2024 02:05:17 -0500
Subject: [PATCH 17/18] add temp inline count

---
 src/engine/compiler/SinglePassCompiler.v3 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 5df9336b..1143928e 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -114,6 +114,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 
 	var intrinsified_read_probe: MemoryReadProbe = null;
 	var intrinsified_write_probe: MemoryWriteProbe = null;
+	var inline_count = 0;
 
 	new() {
 		masm.unimplemented = unsupported;
@@ -409,7 +410,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		var orig_sp = state.sp;
 		var callee_func = WasmFunction.!(probe.func);
 
-		if (SpcTuning.inlineSmallFunc) {
+		if (SpcTuning.inlineSmallFunc && inline_count < 1000) {
+			inline_count++;
 			// TODO: can reuse
when implementing inlining for SPC inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func); if (!probe.inline_heuristic_checked) { From 6c09c42df2df3d2e3fd8031108d8c85ac619477d Mon Sep 17 00:00:00 2001 From: Tom An Date: Thu, 12 Dec 2024 02:08:16 -0500 Subject: [PATCH 18/18] add temp inline count --- src/engine/compiler/SinglePassCompiler.v3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 1143928e..79da7aee 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -410,7 +410,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var orig_sp = state.sp; var callee_func = WasmFunction.!(probe.func); - if (SpcTuning.inlineSmallFunc && inline_count < 1000) { + if (SpcTuning.inlineSmallFunc && inline_count < 100) { inline_count++; // TODO: can reuse when implementing inlining for SPC inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func);