Cranelift: fix load width on select-of-load.f64 by disallowing load f…

…usion. (#8113) We have an instruction lowering rule on x86-64 that allows a `select` operation to perform load fusion: when presented with any xmm-register-typed values (`f32`, `f64`, or `v128`), an argument to the select can become a load. Unfortunately, this lowering behavior is incorrect in the case of narrower-than-128-bit values: the cmove is converted into an if-else diamond with two 128-bit moves and so the load becomes a full 128-bit-width load. The fix is to disallow load fusion of selects of XMM-typed values. We could make the rules more fine-grained and keep `v128`-typed load fusion, but we're opting for the simpler and more conservative fix first here. Fixes #8112.
bytecodealliance · Mar 13, 2024 · 14fd410 · 14fd410
1 parent 18a481c
commit 14fd410
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 1 deletion.
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
@@ -2880,7 +2880,14 @@
       (cmove ty cc consequent alternative))
 
 (rule (cmove_from_values (is_xmm_type ty) cc consequent alternative)
-      (cmove_xmm ty cc consequent alternative))
+      ;; force the `Value`s to `Xmm`s (in-register values) here:
+      ;; `cmove_xmm` can take an `XmmMemAligned`, but we don't want to
+      ;; allow load fusion here because `ty` may not be
+      ;; XMM-register-wide even if it is `is_xmm_type`. (E.g.,
+      ;; consider a cmove of `f64` values.)
+      (let ((cons Xmm consequent)
+            (alt Xmm alternative))
+        (cmove_xmm ty cc cons alt)))
 
 ;; Helper for creating `cmove` instructions with the logical OR of multiple
 ;; flags. Note that these instructions will always result in more than one

diff --git a/cranelift/filetests/filetests/isa/x64/xmm-select-load.clif b/cranelift/filetests/filetests/isa/x64/xmm-select-load.clif
@@ -0,0 +1,34 @@
+test compile precise-output
+target x86_64
+
+function %a(i32, f64, i64) -> f64 {
+block0(v0: i32, v1: f64, v2: i64):
+  v3 = load.f64 v2
+  v4 = select v0, v3, v1
+  return v4
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movsd   0(%rsi), %xmm5
+;   testl   %edi, %edi
+;   movsd %xmm0, %xmm0; jz $next; movsd %xmm5, %xmm0; $next:
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movsd (%rsi), %xmm5 ; trap: heap_oob
+;   testl %edi, %edi
+;   je 0x14
+;   movsd %xmm5, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+