From e463890f26ac82cdaac4be386d9489260b26bfd9 Mon Sep 17 00:00:00 2001 From: Damian Heaton <87125748+dheaton-arm@users.noreply.github.com> Date: Mon, 8 Aug 2022 19:35:43 +0100 Subject: [PATCH] Port `AvgRound` & `SqmulRoundSat` to ISLE (AArch64) (#4639) Ported the existing implementations of the following opcodes on AArch64 to ISLE: - `AvgRound` - Also introduced support for `i64x2` vectors, as per the docs. - `SqmulRoundSat` Copyright (c) 2022 Arm Limited --- .../codegen/meta/src/shared/instructions.rs | 2 + cranelift/codegen/src/isa/aarch64/inst.isle | 7 + .../codegen/src/isa/aarch64/inst/emit.rs | 29 +-- .../src/isa/aarch64/inst/emit_tests.rs | 180 ++++++++++++++++++ cranelift/codegen/src/isa/aarch64/lower.isle | 21 ++ .../codegen/src/isa/aarch64/lower_inst.rs | 45 +---- .../isa/aarch64/simd-arithmetic.clif | 81 ++++++++ .../filetests/runtests/simd-arithmetic.clif | 9 + .../filetests/runtests/simd-avg-round.clif | 51 +++++ 9 files changed, 369 insertions(+), 56 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-avg-round.clif diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index c3b7467aa162..83c2a1de94b4 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -592,6 +592,8 @@ fn define_simd_arithmetic( "avg_round", r#" Unsigned average with rounding: `a := (x + y + 1) // 2` + + The addition does not lose any information (such as from overflow). "#, &formats.binary, ) diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index f45caf58b594..0faa1280f92b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -1551,6 +1551,13 @@ (_ Unit (emit (MInst.VecLanes op dst src size)))) dst)) +;; Helper for emitting `MInst.VecShiftImm` instructions. +(decl vec_shift_imm (VecShiftImmOp u8 Reg VectorSize) Reg) +(rule (vec_shift_imm op imm src size) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecShiftImm op dst src size imm)))) + dst)) + ;; Helper for emitting `MInst.VecDup` instructions. (decl vec_dup (Reg VectorSize) Reg) (rule (vec_dup src size) diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index d8a0f805aca6..c36e118c246c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1976,31 +1976,34 @@ impl MachInstEmit for Inst { } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); - let (is_shr, template) = match op { - VecShiftImmOp::Ushr => (true, 0b_011_011110_0000_000_000001_00000_00000_u32), - VecShiftImmOp::Sshr => (true, 0b_010_011110_0000_000_000001_00000_00000_u32), - VecShiftImmOp::Shl => (false, 0b_010_011110_0000_000_010101_00000_00000_u32), + let (is_shr, mut template) = match op { + VecShiftImmOp::Ushr => (true, 0b_001_011110_0000_000_000001_00000_00000_u32), + VecShiftImmOp::Sshr => (true, 0b_000_011110_0000_000_000001_00000_00000_u32), + VecShiftImmOp::Shl => (false, 0b_000_011110_0000_000_010101_00000_00000_u32), }; + if size.is_128bits() { + template |= 0b1 << 30; + } let imm = imm as u32; // Deal with the somewhat strange encoding scheme for, and limits on, // the shift amount. - let immh_immb = match (size, is_shr) { - (VectorSize::Size64x2, true) if imm >= 1 && imm <= 64 => { + let immh_immb = match (size.lane_size(), is_shr) { + (ScalarSize::Size64, true) if imm >= 1 && imm <= 64 => { 0b_1000_000_u32 | (64 - imm) } - (VectorSize::Size32x4, true) if imm >= 1 && imm <= 32 => { + (ScalarSize::Size32, true) if imm >= 1 && imm <= 32 => { 0b_0100_000_u32 | (32 - imm) } - (VectorSize::Size16x8, true) if imm >= 1 && imm <= 16 => { + (ScalarSize::Size16, true) if imm >= 1 && imm <= 16 => { 0b_0010_000_u32 | (16 - imm) } - (VectorSize::Size8x16, true) if imm >= 1 && imm <= 8 => { + (ScalarSize::Size8, true) if imm >= 1 && imm <= 8 => { 0b_0001_000_u32 | (8 - imm) } - (VectorSize::Size64x2, false) if imm <= 63 => 0b_1000_000_u32 | imm, - (VectorSize::Size32x4, false) if imm <= 31 => 0b_0100_000_u32 | imm, - (VectorSize::Size16x8, false) if imm <= 15 => 0b_0010_000_u32 | imm, - (VectorSize::Size8x16, false) if imm <= 7 => 0b_0001_000_u32 | imm, + (ScalarSize::Size64, false) if imm <= 63 => 0b_1000_000_u32 | imm, + (ScalarSize::Size32, false) if imm <= 31 => 0b_0100_000_u32 | imm, + (ScalarSize::Size16, false) if imm <= 15 => 0b_0010_000_u32 | imm, + (ScalarSize::Size8, false) if imm <= 7 => 0b_0001_000_u32 | imm, _ => panic!( "aarch64: Inst::VecShiftImm: emit: invalid op/size/imm {:?}, {:?}, {:?}", op, size, imm diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 86b6a543f525..2ecb20d4df7f 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -3946,6 +3946,18 @@ fn test_aarch64_binemit() { "smax v8.4s, v12.4s, v14.4s", )); + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd: writable_vreg(8), + rn: vreg(1), + rm: vreg(3), + size: VectorSize::Size8x8, + }, + "2814232E", + "urhadd v8.8b, v1.8b, v3.8b", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Urhadd, @@ -3958,6 +3970,18 @@ fn test_aarch64_binemit() { "urhadd v8.16b, v1.16b, v3.16b", )); + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd: writable_vreg(2), + rn: vreg(13), + rm: vreg(6), + size: VectorSize::Size16x4, + }, + "A215662E", + "urhadd v2.4h, v13.4h, v6.4h", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Urhadd, @@ -3970,6 +3994,18 @@ fn test_aarch64_binemit() { "urhadd v2.8h, v13.8h, v6.8h", )); + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x2, + }, + "8815AE2E", + "urhadd v8.2s, v12.2s, v14.2s", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Urhadd, @@ -5123,6 +5159,126 @@ fn test_aarch64_binemit() { "sshr v3.8h, v19.8h, #1", )); + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 8, + size: VectorSize::Size8x8, + }, + "D904082F", + "ushr v25.8b, v6.8b, #8", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size8x8, + }, + "A5060F2F", + "ushr v5.8b, v21.8b, #1", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 8, + size: VectorSize::Size8x16, + }, + "D904086F", + "ushr v25.16b, v6.16b, #8", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size8x16, + }, + "A5060F6F", + "ushr v5.16b, v21.16b, #1", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 16, + size: VectorSize::Size16x4, + }, + "D904102F", + "ushr v25.4h, v6.4h, #16", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size16x4, + }, + "A5061F2F", + "ushr v5.4h, v21.4h, #1", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 16, + size: VectorSize::Size16x8, + }, + "D904106F", + "ushr v25.8h, v6.8h, #16", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size16x8, + }, + "A5061F6F", + "ushr v5.8h, v21.8h, #1", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 32, + size: VectorSize::Size32x2, + }, + "D904202F", + "ushr v25.2s, v6.2s, #32", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size32x2, + }, + "A5063F2F", + "ushr v5.2s, v21.2s, #1", + )); + insns.push(( Inst::VecShiftImm { op: VecShiftImmOp::Ushr, @@ -5147,6 +5303,30 @@ fn test_aarch64_binemit() { "ushr v5.4s, v21.4s, #1", )); + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 64, + size: VectorSize::Size64x2, + }, + "D904406F", + "ushr v25.2d, v6.2d, #64", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size64x2, + }, + "A5067F6F", + "ushr v5.2d, v21.2d, #1", + )); + insns.push(( Inst::VecShiftImm { op: VecShiftImmOp::Shl, diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 293cd9bc0f13..0cee3acc08a0 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -233,6 +233,27 @@ (rule (lower (has_type (fits_in_32 ty) (iabs x))) (abs (OperandSize.Size32) (put_in_reg_sext32 x))) +;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $I64X2 (avg_round x y))) + (let ((one Reg (splat_const 1 (VectorSize.Size64x2))) + (c Reg (orr_vec x y (VectorSize.Size64x2))) + (c Reg (and_vec c one (VectorSize.Size64x2))) + (x Reg (vec_shift_imm (VecShiftImmOp.Ushr) 1 x + (VectorSize.Size64x2))) + (y Reg (vec_shift_imm (VecShiftImmOp.Ushr) 1 y + (VectorSize.Size64x2))) + (sum Reg (add_vec x y (VectorSize.Size64x2)))) + (add_vec c sum (VectorSize.Size64x2)))) + +(rule (lower (has_type (lane_fits_in_32 ty) (avg_round x y))) + (vec_rrr (VecALUOp.Urhadd) x y (vector_size ty))) + +;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (sqmul_round_sat x y))) + (vec_rrr (VecALUOp.Sqrdmulh) x y (vector_size ty))) + ;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty @ (multi_lane _ _) (fadd rn rm))) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 8338d788df22..24d45d0c2706 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1502,27 +1502,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Iabs => implemented_in_isle(ctx), - Opcode::AvgRound => { - let ty = ty.unwrap(); - - if ty.lane_bits() == 64 { - return Err(CodegenError::Unsupported(format!( - "AvgRound: Unsupported type: {:?}", - ty - ))); - } - - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Urhadd, - rd, - rn, - rm, - size: VectorSize::from_ty(ty), - }); - } + Opcode::AvgRound => implemented_in_isle(ctx), Opcode::Snarrow | Opcode::Unarrow | Opcode::Uunarrow => implemented_in_isle(ctx), @@ -1583,28 +1563,7 @@ pub(crate) fn lower_insn_to_regs>( } }, - Opcode::SqmulRoundSat => { - let ty = ty.unwrap(); - - if !ty.is_vector() || (ty.lane_type() != I16 && ty.lane_type() != I32) { - return Err(CodegenError::Unsupported(format!( - "SqmulRoundSat: Unsupported type: {:?}", - ty - ))); - } - - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Sqrdmulh, - rd, - rn, - rm, - size: VectorSize::from_ty(ty), - }); - } + Opcode::SqmulRoundSat => implemented_in_isle(ctx), Opcode::FcvtLowFromSint => { let ty = ty.unwrap(); diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif new file mode 100644 index 000000000000..fc511d97161a --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif @@ -0,0 +1,81 @@ +test compile precise-output +set unwind_info=false +target aarch64 + +function %average_rounding_i8x8(i8x8, i8x8) -> i8x8 { +block0(v0: i8x8, v1: i8x8): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; urhadd v0.8b, v0.8b, v1.8b +; ret + +function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; urhadd v0.16b, v0.16b, v1.16b +; ret + +function %average_rounding_i16x4(i16x4, i16x4) -> i16x4 { +block0(v0: i16x4, v1: i16x4): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; urhadd v0.4h, v0.4h, v1.4h +; ret + +function %average_rounding_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; urhadd v0.8h, v0.8h, v1.8h +; ret + +function %average_rounding_i32x2(i32x2, i32x2) -> i32x2 { +block0(v0: i32x2, v1: i32x2): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; urhadd v0.2s, v0.2s, v1.2s +; ret + +function %average_rounding_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; urhadd v0.4s, v0.4s, v1.4s +; ret + +function %average_rounding_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = avg_round v0, v1 + return v2 +} + +; block0: +; movz x6, #1 +; dup v6.2d, x6 +; orr v17.16b, v0.16b, v1.16b +; and v19.16b, v17.16b, v6.16b +; ushr v21.2d, v0.2d, #1 +; ushr v23.2d, v1.2d, #1 +; add v25.2d, v21.2d, v23.2d +; add v0.2d, v19.2d, v25.2d +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif index 58a0dc1c21f6..ebcfa3246eab 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif @@ -1,3 +1,5 @@ +; the interpreter does not currently support some of these instructions +; such as `avg_round` on SIMD values. test run target aarch64 target s390x @@ -172,6 +174,13 @@ block0(v0: f32x4): } ; run: %fabs_f32x4([0x0.0 -0x1.0 0x2.0 -0x3.0]) == [0x0.0 0x1.0 0x2.0 0x3.0] +function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i8x16([0 0 0 1 42 19 -1 0xff 5 0 0 0 1 42 19 -1], [0 1 2 4 42 18 -1 0 10 0 1 2 4 42 18 -1]) == [0 1 1 3 42 19 -1 0x80 8 0 1 1 3 42 19 -1] + function %average_rounding_i16x8(i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8): v2 = avg_round v0, v1 diff --git a/cranelift/filetests/filetests/runtests/simd-avg-round.clif b/cranelift/filetests/filetests/runtests/simd-avg-round.clif new file mode 100644 index 000000000000..69311fd5d7df --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-avg-round.clif @@ -0,0 +1,51 @@ +; the interpreter does not currently support SIMD `avg_round`. +test run +target aarch64 +; x86_64 and s390x do not currently support 64-bit vectors, or +; `avg_round` on `i64x2` values. +; x86_64 also does not currently support `avg_round.i32x4`. + +function %average_rounding_i8x8(i8x8, i8x8) -> i8x8 { +block0(v0: i8x8, v1: i8x8): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i8x8([0 0 0 1 42 19 -1 0xff], [0 1 2 4 42 18 -1 0]) == [0 1 1 3 42 19 -1 0x80] + +function %average_rounding_i16x4(i16x4, i16x4) -> i16x4 { +block0(v0: i16x4, v1: i16x4): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i16x4([0 0 0 1], [0 1 2 4]) == [0 1 1 3] +; run: %average_rounding_i16x4([42 19 -1 0xffff], [42 18 -1 0]) == [42 19 -1 0x8000] + +function %average_rounding_i32x2(i32x2, i32x2) -> i32x2 { +block0(v0: i32x2, v1: i32x2): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i32x2([0 0], [0 1]) == [0 1] +; run: %average_rounding_i32x2([0 1], [2 4]) == [1 3] +; run: %average_rounding_i32x2([42 19], [42 18]) == [42 19] +; run: %average_rounding_i32x2([-1 0xffffffff], [-1 0]) == [-1 0x80000000] +; run: %average_rounding_i32x2([0xffffffff 0xfffffffd], [10 0xffffffff]) == [0x80000005 0xfffffffe] + +function %average_rounding_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i32x4([0 0 0 0xffffffff], [0 1 2 0]) == [0 1 1 0x80000000] +; run: %average_rounding_i32x4([1 42 19 -1], [4 42 18 -1]) == [3 42 19 -1] + +function %average_rounding_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i64x2([0 0], [0 1]) == [0 1] +; run: %average_rounding_i64x2([0 1], [2 4]) == [1 3] +; run: %average_rounding_i64x2([42 19], [42 18]) == [42 19] +; run: %average_rounding_i64x2([-1 0xffffffffffffffff], [-1 0]) == [-1 0x8000000000000000] +; run: %average_rounding_i64x2([0xffffffffffffffff 0xfffffffffffffffd], [10 0xffffffffffffffff]) == [0x8000000000000005 0xfffffffffffffffe]