x64: Implement rotl/rotr for some small type combinations
afonso360 committed Aug 11, 2022
1 parent 66e245d commit 32cda80
Showing 5 changed files with 234 additions and 272 deletions.
48 changes: 10 additions & 38 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -818,31 +818,17 @@
 
 ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; `i16` and `i8`: we need to extend the shift amount, or mask the
-;; constant.
+;; `i64` and smaller: we can rely on x86's rotate-amount masking since
+;; we operate on the whole register. For const's we mask the constant.
 
-(rule (lower (has_type (ty_8_or_16 ty) (rotl src amt)))
-      (let ((amt_ Gpr (extend_to_gpr amt $I32 (ExtendKind.Zero))))
-        (x64_rotl ty src (gpr_to_imm8_gpr amt_))))
+(rule (lower (has_type (fits_in_64 ty) (rotl src amt)))
+      (x64_rotl ty src (put_masked_in_imm8_gpr amt ty)))
 
-(rule (lower (has_type (ty_8_or_16 ty)
+(rule (lower (has_type (fits_in_64 ty)
                        (rotl src (u64_from_iconst amt))))
       (x64_rotl ty src
                 (const_to_type_masked_imm8 amt ty)))
 
-;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
-;; we operate on the whole register.
-
-(rule (lower (has_type (ty_32_or_64 ty) (rotl src amt)))
-      ;; NB: Only the low bits of `amt` matter since we logically mask the
-      ;; shift amount to the value's bit width.
-      (let ((amt_ Gpr (lo_gpr amt)))
-        (x64_rotl ty src amt_)))
-
-(rule (lower (has_type (ty_32_or_64 ty)
-                       (rotl src (u64_from_iconst amt))))
-      (x64_rotl ty src
-                (const_to_type_masked_imm8 amt ty)))
 
 ;; `i128`.
 
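An illustrative aside, not part of the commit: the new `fits_in_64` rules are sound because rotation of an N-bit value is periodic with period N, so reducing the rotate amount mod N (which is what x86's rotate-amount masking effectively achieves for rotates) leaves the result unchanged. A minimal Rust sketch of that property, checked against run lines from rotl.clif below:

// Sketch only: models the masking argument in the comment above,
// not Cranelift's actual lowering code.
fn rotl8(x: u8, amt: u64) -> u8 {
    // Rotate built from shifts, amount reduced mod the 8-bit width.
    let n = (amt % 8) as u32;
    if n == 0 { x } else { (x << n) | (x >> (8 - n)) }
}

fn main() {
    // Mirrors run lines in rotl.clif: 65 and 257 both act as rotate-by-1.
    assert_eq!(rotl8(0xe4, 1), 0xc9);
    assert_eq!(rotl8(0xe4, 65), 0xc9);
    assert_eq!(rotl8(0xe4, 257), 0xc9);
    assert_eq!(rotl8(0xe4, 64), 0xe4); // 64 % 8 == 0: identity
}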
@@ -858,31 +844,17 @@
 
 ;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; `i16` and `i8`: we need to extend the shift amount, or mask the
-;; constant.
+;; `i64` and smaller: we can rely on x86's rotate-amount masking since
+;; we operate on the whole register. For const's we mask the constant.
 
-(rule (lower (has_type (ty_8_or_16 ty) (rotr src amt)))
-      (let ((amt_ Gpr (extend_to_gpr amt $I32 (ExtendKind.Zero))))
-        (x64_rotr ty src amt_)))
+(rule (lower (has_type (fits_in_64 ty) (rotr src amt)))
+      (x64_rotr ty src (put_masked_in_imm8_gpr amt ty)))
 
-(rule (lower (has_type (ty_8_or_16 ty)
+(rule (lower (has_type (fits_in_64 ty)
                        (rotr src (u64_from_iconst amt))))
       (x64_rotr ty src
                 (const_to_type_masked_imm8 amt ty)))
 
-;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
-;; we operate on the whole register.
-
-(rule (lower (has_type (ty_32_or_64 ty) (rotr src amt)))
-      ;; NB: Only the low bits of `amt` matter since we logically mask the
-      ;; shift amount to the value's bit width.
-      (let ((amt_ Gpr (lo_gpr amt)))
-        (x64_rotr ty src amt_)))
-
-(rule (lower (has_type (ty_32_or_64 ty)
-                       (rotr src (u64_from_iconst amt))))
-      (x64_rotr ty src
-                (const_to_type_masked_imm8 amt ty)))
 
 ;; `i128`.
 
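The `rotr` rules mirror `rotl`; the same mod-width argument carries over through the complement identity between the two rotations. A second small Rust check, again illustrative only and not part of the commit:

fn main() {
    for x in [0xe0u8, 0xef, 0xe4] {
        for amt in 0..32u32 {
            // rotr by amt equals rotl by the complementary amount mod 8,
            // so the masking argument used for rotl applies unchanged.
            // (Rust's rotate_right/rotate_left also reduce mod the width.)
            let n = amt % 8;
            assert_eq!(x.rotate_right(amt), x.rotate_left((8 - n) % 8));
        }
    }
}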
117 changes: 0 additions & 117 deletions cranelift/filetests/filetests/runtests/rotl-small-types.clif

This file was deleted.

112 changes: 112 additions & 0 deletions cranelift/filetests/filetests/runtests/rotl.clif
@@ -117,6 +117,118 @@ block0(v0: i32, v1: i8):
; run: %rotl_i32_i8(0xe0000004, 66) == 0x80000013


function %rotl_i16_i64(i16, i64) -> i16 {
block0(v0: i16, v1: i64):
v2 = rotl.i16 v0, v1
return v2
}
; run: %rotl_i16_i64(0xe000, 0) == 0xe000
; run: %rotl_i16_i64(0xe000, 1) == 0xc001
; run: %rotl_i16_i64(0xef0f, 0) == 0xef0f
; run: %rotl_i16_i64(0xef0f, 4) == 0xf0fe
; run: %rotl_i16_i64(0xe004, 64) == 0xe004
; run: %rotl_i16_i64(0xe004, 65) == 0xc009
; run: %rotl_i16_i64(0xe004, 66) == 0x8013
; run: %rotl_i16_i64(0xe004, 257) == 0xc009

function %rotl_i16_i32(i16, i32) -> i16 {
block0(v0: i16, v1: i32):
v2 = rotl.i16 v0, v1
return v2
}
; run: %rotl_i16_i32(0xe000, 0) == 0xe000
; run: %rotl_i16_i32(0xe000, 1) == 0xc001
; run: %rotl_i16_i32(0xef0f, 0) == 0xef0f
; run: %rotl_i16_i32(0xef0f, 4) == 0xf0fe
; run: %rotl_i16_i32(0xe004, 64) == 0xe004
; run: %rotl_i16_i32(0xe004, 65) == 0xc009
; run: %rotl_i16_i32(0xe004, 66) == 0x8013
; run: %rotl_i16_i32(0xe004, 257) == 0xc009

function %rotl_i16_i16(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = rotl.i16 v0, v1
return v2
}
; run: %rotl_i16_i16(0xe000, 0) == 0xe000
; run: %rotl_i16_i16(0xe000, 1) == 0xc001
; run: %rotl_i16_i16(0xef0f, 0) == 0xef0f
; run: %rotl_i16_i16(0xef0f, 4) == 0xf0fe
; run: %rotl_i16_i16(0xe004, 64) == 0xe004
; run: %rotl_i16_i16(0xe004, 65) == 0xc009
; run: %rotl_i16_i16(0xe004, 66) == 0x8013
; run: %rotl_i16_i16(0xe004, 257) == 0xc009

function %rotl_i16_i8(i16, i8) -> i16 {
block0(v0: i16, v1: i8):
v2 = rotl.i16 v0, v1
return v2
}
; run: %rotl_i16_i8(0xe000, 0) == 0xe000
; run: %rotl_i16_i8(0xe000, 1) == 0xc001
; run: %rotl_i16_i8(0xef0f, 0) == 0xef0f
; run: %rotl_i16_i8(0xef0f, 4) == 0xf0fe
; run: %rotl_i16_i8(0xe004, 64) == 0xe004
; run: %rotl_i16_i8(0xe004, 65) == 0xc009
; run: %rotl_i16_i8(0xe004, 66) == 0x8013


function %rotl_i8_i64(i8, i64) -> i8 {
block0(v0: i8, v1: i64):
v2 = rotl.i8 v0, v1
return v2
}
; run: %rotl_i8_i64(0xe0, 0) == 0xe0
; run: %rotl_i8_i64(0xe0, 1) == 0xc1
; run: %rotl_i8_i64(0xef, 0) == 0xef
; run: %rotl_i8_i64(0xef, 4) == 0xfe
; run: %rotl_i8_i64(0xe4, 64) == 0xe4
; run: %rotl_i8_i64(0xe4, 65) == 0xc9
; run: %rotl_i8_i64(0xe4, 66) == 0x93
; run: %rotl_i8_i64(0xe4, 257) == 0xc9

function %rotl_i8_i32(i8, i32) -> i8 {
block0(v0: i8, v1: i32):
v2 = rotl.i8 v0, v1
return v2
}
; run: %rotl_i8_i32(0xe0, 0) == 0xe0
; run: %rotl_i8_i32(0xe0, 1) == 0xc1
; run: %rotl_i8_i32(0xef, 0) == 0xef
; run: %rotl_i8_i32(0xef, 4) == 0xfe
; run: %rotl_i8_i32(0xe4, 64) == 0xe4
; run: %rotl_i8_i32(0xe4, 65) == 0xc9
; run: %rotl_i8_i32(0xe4, 66) == 0x93
; run: %rotl_i8_i32(0xe4, 257) == 0xc9

function %rotl_i8_i16(i8, i16) -> i8 {
block0(v0: i8, v1: i16):
v2 = rotl.i8 v0, v1
return v2
}
; run: %rotl_i8_i16(0xe0, 0) == 0xe0
; run: %rotl_i8_i16(0xe0, 1) == 0xc1
; run: %rotl_i8_i16(0xef, 0) == 0xef
; run: %rotl_i8_i16(0xef, 4) == 0xfe
; run: %rotl_i8_i16(0xe4, 64) == 0xe4
; run: %rotl_i8_i16(0xe4, 65) == 0xc9
; run: %rotl_i8_i16(0xe4, 66) == 0x93
; run: %rotl_i8_i16(0xe4, 257) == 0xc9

function %rotl_i8_i8(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = rotl.i8 v0, v1
return v2
}
; run: %rotl_i8_i8(0xe0, 0) == 0xe0
; run: %rotl_i8_i8(0xe0, 1) == 0xc1
; run: %rotl_i8_i8(0xef, 0) == 0xef
; run: %rotl_i8_i8(0xef, 4) == 0xfe
; run: %rotl_i8_i8(0xe4, 64) == 0xe4
; run: %rotl_i8_i8(0xe4, 65) == 0xc9
; run: %rotl_i8_i8(0xe4, 66) == 0x93


;; This is a regression test for rotates on x64
;; See: https://github.com/bytecodealliance/wasmtime/pull/3610