Skip to content

Commit

Permalink
pulley: Slightly optimize bounds checks (bytecodealliance#10080)
Browse files Browse the repository at this point in the history
* pulley: Slightly optimize bounds checks

In profiling a module I was noticing that the previous
`xbc32_bound_trap` instruction wasn't being used when I expected.
Investigation revealed that the load of the bound itself was GVN'd and
deduplicated (yay!) but it meant that the load was used in two locations
meaning it didn't pass checks for `sinkable_load`. This commit fixes
this by repurposing `xbc32_bound_trap` for "the bound is in a register"
and renaming the previous instruction to `xbc32_boundne_trap`. This
helps cut down on the number of opcodes in this benchmark and improves
performance slightly.

At the same time this tightens up "sinkable loads" to require native
endianness since that's what the bound of memory is stored as.
Additionally in addition to testing for `a < b` and optimizing that this
also now optimizes `b > a`, the same condition just having the arguments
swapped.

* Fix some copy/paste typos
  • Loading branch information
alexcrichton authored Jan 22, 2025
1 parent 2f27a10 commit 7d78789
Show file tree
Hide file tree
Showing 9 changed files with 329 additions and 43 deletions.
14 changes: 13 additions & 1 deletion cranelift/codegen/src/isa/pulley_shared/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,26 @@
(extern constructor endianness endianness)
(type Endianness extern (enum Little Big))

(decl pure is_native_endianness (Endianness) bool)
(extern constructor is_native_endianness is_native_endianness)

;; Partial constructor and type representing a "sinkable load" which can be
;; moved into another instruction. Note that `SinkableLoad` should not be used
;; as-is and should instead be converted to a `SunkLoad`.
;;
;; To be a sinkable load the load must pass:
;;
;; * The `is_sinkable_inst` shared amongst backends test must be `true`
;; * The load must be in "native endianness"
;; * The static offset must fit in an unsigned 8-bit integer.
;;
;; If the last two requirements here are too restrictive then multiple helpers
;; might be needed in the future.
(type SinkableLoad (enum (Load (inst Inst) (ty Type) (addr Value) (offset u8))))
(decl pure partial sinkable_load (Value) SinkableLoad)
(rule (sinkable_load value @ (value_type ty))
(if-let inst @ (load flags addr (offset32 offset)) (is_sinkable_inst value))
(if-let (Endianness.Little) (endianness flags))
(if-let true (is_native_endianness (endianness flags)))
(if-let offset8 (u8_try_from_i32 offset))
(SinkableLoad.Load inst ty addr offset8))

Expand Down
40 changes: 29 additions & 11 deletions cranelift/codegen/src/isa/pulley_shared/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -123,23 +123,41 @@
;; Each of these translates to a single "xbc" (x-register bounds check)
;; instruction.
;;
;; Note that there are two cases here, one for 32-bit hosts and one for 64-bit
;; hosts. They lower to the same `xbc32_bound_trap` instruction which has
;; different semantics on 32/64-bit but uses the 32-bit address as an argument
;; on both platforms.
;; Currently there's a 2x2 matrix here. One axis is 32-bit hosts and 64-bit
;; hosts while the other axis is `a < b` vs `a > b`. These all bottom out
;; in the `emit_xbc32` helper below.
(rule 1 (lower (trapnz (icmp (IntCC.UnsignedGreaterThan) a @ (value_type $I32) (isub b (u8_from_iconst size))) code))
(if-let (PointerWidth.PointerWidth32) (pointer_width))
(if-let load (sinkable_load b))
(side_effect (emit_xbc32 a load size code)))
(side_effect (emit_xbc32 a b size code)))

(rule 1 (lower (trapnz (icmp (IntCC.UnsignedLessThan) (isub b (u8_from_iconst size)) a @ (value_type $I32)) code))
(if-let (PointerWidth.PointerWidth32) (pointer_width))
(side_effect (emit_xbc32 a b size code)))

(rule 1 (lower (trapnz (icmp (IntCC.UnsignedGreaterThan) (uextend a @ (value_type $I32)) (isub b (u8_from_iconst size))) code))
(if-let (PointerWidth.PointerWidth64) (pointer_width))
(if-let load (sinkable_load b))
(side_effect (emit_xbc32 a load size code)))
(side_effect (emit_xbc32 a b size code)))

(decl emit_xbc32 (Value SunkLoad u8 TrapCode) SideEffectNoResult)
(rule (emit_xbc32 a (SunkLoad.Load _ bound_addr bound_off) size code)
(pulley_xbc32_bound_trap a bound_addr bound_off size code))
(rule 1 (lower (trapnz (icmp (IntCC.UnsignedLessThan) (isub b (u8_from_iconst size)) (uextend a @ (value_type $I32))) code))
(if-let (PointerWidth.PointerWidth64) (pointer_width))
(side_effect (emit_xbc32 a b size code)))

;; Helper to emit a bounds check which traps if the first value is greater than
;; the second value minus the immediate size provided here.
;;
;; This helper will see if the second value is a sinkable load in which case
;; it can fold the load directly into the "xbc" instruction. Otherwise a
;; simpler "xbc" instruction is used.
(decl emit_xbc32 (Value Value u8 TrapCode) SideEffectNoResult)
(rule 0 (emit_xbc32 a bound size code)
(pulley_xbc32_bound_trap a bound size code))
(rule 1 (emit_xbc32 a bound size code)
(if-let load (sinkable_load bound))
(emit_xbc32_sunk a load size code))

(decl emit_xbc32_sunk (Value SunkLoad u8 TrapCode) SideEffectNoResult)
(rule (emit_xbc32_sunk a (SunkLoad.Load _ bound_addr bound_off) size code)
(pulley_xbc32_boundne_trap a bound_addr bound_off size code))

;;;; Rules for `get_stack_pointer` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/pulley_shared/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ where
flags.endianness(self.backend.isa_flags.endianness())
}

fn is_native_endianness(&mut self, endianness: &Endianness) -> bool {
*endianness == self.backend.isa_flags.endianness()
}

fn pointer_width(&mut self) -> PointerWidth {
P::pointer_width()
}
Expand Down
104 changes: 104 additions & 0 deletions cranelift/filetests/filetests/isa/pulley32/xbc.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
test compile precise-output
target pulley32

function %simple(i32, i32) {
block0(v0: i32, v1: i32):
v2 = load.i32 v0+16
v3 = iconst.i32 24
v4 = isub v2, v3
v5 = icmp ugt v1, v4
trapnz v5, user1
return
}

; VCode:
; block0:
; xbc32_boundne_trap x1, x0, 16, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xbc32_boundne_trap x1, x0, 16, 24
; ret

function %swapped_args(i32, i32) {
block0(v0: i32, v1: i32):
v2 = load.i32 v0+16
v3 = iconst.i32 24
v4 = isub v2, v3
v5 = icmp ult v4, v1
trapnz v5, user1
return
}

; VCode:
; block0:
; xbc32_boundne_trap x1, x0, 16, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xbc32_boundne_trap x1, x0, 16, 24
; ret

function %twice(i32, i32, i32) {
block0(v0: i32, v1: i32, v2: i32):
;; load the bound & calculate what to check against
v3 = load.i32 v0+16
v4 = iconst.i32 24
v5 = isub v3, v4

;; check v1
v6 = icmp ugt v1, v5
trapnz v6, user1

;; check v2
v7 = icmp ugt v2, v5
trapnz v7, user1

return
}

; VCode:
; block0:
; x4 = xload32 x0+16 // flags =
; xbc32_bound_trap x1, x4, 24 // trap=TrapCode(1)
; xbc32_bound_trap x2, x4, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xload32le_offset8 x4, x0, 16
; xbc32_bound_trap x1, x4, 24
; xbc32_bound_trap x2, x4, 24
; ret


function %twice_swapped(i32, i32, i32) {
block0(v0: i32, v1: i32, v2: i32):
;; load the bound & calculate what to check against
v3 = load.i32 v0+16
v4 = iconst.i32 24
v5 = isub v3, v4

;; check v1
v6 = icmp ult v5, v1
trapnz v6, user1

;; check v2
v7 = icmp ult v5, v1
trapnz v7, user1

return
}

; VCode:
; block0:
; x3 = xload32 x0+16 // flags =
; xbc32_bound_trap x1, x3, 24 // trap=TrapCode(1)
; xbc32_bound_trap x1, x3, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xload32le_offset8 x3, x0, 16
; xbc32_bound_trap x1, x3, 24
; xbc32_bound_trap x1, x3, 24
; ret

114 changes: 114 additions & 0 deletions cranelift/filetests/filetests/isa/pulley64/xbc.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
test compile precise-output
target pulley64

function %simple(i64, i32) {
block0(v0: i64, v1: i32):
v2 = load.i64 v0+16
v3 = uextend.i64 v1
v4 = iconst.i64 24
v5 = isub v2, v4
v6 = icmp ugt v3, v5
trapnz v6, user1
return
}

; VCode:
; block0:
; xbc32_boundne_trap x1, x0, 16, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xbc32_boundne_trap x1, x0, 16, 24
; ret

function %swapped_args(i64, i32) {
block0(v0: i64, v1: i32):
v2 = load.i64 v0+16
v3 = uextend.i64 v1
v4 = iconst.i64 24
v5 = isub v2, v4
v6 = icmp ult v5, v3
trapnz v6, user1
return
}

; VCode:
; block0:
; xbc32_boundne_trap x1, x0, 16, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xbc32_boundne_trap x1, x0, 16, 24
; ret

function %twice(i64, i32, i32) {
block0(v0: i64, v1: i32, v2: i32):
;; load the bound & calculate what to check against
v3 = load.i64 v0+16
v4 = iconst.i64 24
v5 = isub v3, v4

;; check v1
v6 = uextend.i64 v1
v7 = icmp ugt v6, v5
trapnz v7, user1

;; check v2
v8 = uextend.i64 v2
v9 = icmp ugt v8, v5
trapnz v9, user1

return
}

; VCode:
; block0:
; x4 = xload64 x0+16 // flags =
; xbc32_bound_trap x1, x4, 24 // trap=TrapCode(1)
; xbc32_bound_trap x2, x4, 24 // trap=TrapCode(1)
; ret
;
; Disassembled:
; xload64le_offset8 x4, x0, 16
; xbc32_bound_trap x1, x4, 24
; xbc32_bound_trap x2, x4, 24
; ret

function %twice_swapped(i64, i32, i32) {
block0(v0: i64, v1: i32, v2: i32):
;; load the bound & calculate what to check against
v3 = load.i64 v0+16
v4 = iconst.i64 24
v5 = isub v3, v4

;; check v1
v6 = uextend.i64 v1
v7 = icmp ult v5, v6
trapnz v7, user1

;; check v2
v8 = uextend.i64 v2
v9 = icmp ugt v5, v8
trapnz v9, user1

return
}

; VCode:
; block0:
; x7 = xload64 x0+16 // flags =
; xsub64_u8 x6, x7, 24
; xbc32_bound_trap x1, x7, 24 // trap=TrapCode(1)
; zext32 x7, x2
; trap_if_xult64 x7, x6 // code = TrapCode(1)
; ret
;
; Disassembled:
; xload64le_offset8 x7, x0, 16
; xsub64_u8 x6, x7, 24
; xbc32_bound_trap x1, x7, 24
; zext32 x7, x2
; br_if_xult64 x7, x6, 0x8 // target = 0x17
; ret
; trap

14 changes: 12 additions & 2 deletions pulley/src/interp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2475,7 +2475,17 @@ impl OpVisitor for Interpreter<'_> {
ControlFlow::Continue(())
}

fn xbc32_bound_trap(
fn xbc32_bound_trap(&mut self, addr: XReg, bound: XReg, size: u8) -> ControlFlow<Done> {
let bound = self.state[bound].get_u64() as usize;
let addr = self.state[addr].get_u32() as usize;
if addr > bound.wrapping_sub(usize::from(size)) {
self.done_trap::<crate::XBc32BoundTrap>()
} else {
ControlFlow::Continue(())
}
}

fn xbc32_boundne_trap(
&mut self,
addr: XReg,
bound_ptr: XReg,
Expand All @@ -2485,7 +2495,7 @@ impl OpVisitor for Interpreter<'_> {
let bound = unsafe { self.load::<usize>(bound_ptr, bound_off.into()) };
let addr = self.state[addr].get_u32() as usize;
if addr > bound.wrapping_sub(usize::from(size)) {
self.done_trap::<crate::XBc32BoundTrap>()
self.done_trap::<crate::XBc32BoundNeTrap>()
} else {
ControlFlow::Continue(())
}
Expand Down
11 changes: 10 additions & 1 deletion pulley/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -617,8 +617,17 @@ macro_rules! for_each_op {
/// `dst = low32(cond) ? if_nonzero : if_zero`
xselect64 = XSelect64 { dst: XReg, cond: XReg, if_nonzero: XReg, if_zero: XReg };

/// `trapif(addr > *(bound_ptr + bound_off) - size)` (unsigned)
/// `trapif(addr > bound_ptr - size)` (unsigned)
xbc32_bound_trap = XBc32BoundTrap {
addr: XReg,
bound: XReg,
size: u8
};
/// `trapif(addr > *(bound_ptr + bound_off) - size)` (unsigned)
///
/// Note that the `bound_ptr + bound_off` load loads a
/// host-native-endian pointer-sized value.
xbc32_boundne_trap = XBc32BoundNeTrap {
addr: XReg,
bound_ptr: XReg,
bound_off: u8,
Expand Down
Loading

0 comments on commit 7d78789

Please sign in to comment.