diff --git a/build.rs b/build.rs index ee794a7b734e..835cc8788ea5 100644 --- a/build.rs +++ b/build.rs @@ -180,8 +180,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { _ => (), }, "Cranelift" => match (testsuite, testname) { - ("simd", "simd_store") => return false, ("simd", "simd_i8x16_cmp") => return false, + ("simd", "simd_store") => return false, // Most simd tests are known to fail on aarch64 for now, it's going // to be a big chunk of work to implement them all there! ("simd", _) if target.contains("aarch64") => return true, diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 9439ef55dfc3..7ddf5ecb1b32 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -406,7 +406,7 @@ fn in_int_reg(ty: ir::Type) -> bool { fn in_vec_reg(ty: ir::Type) -> bool { match ty { - types::F32 | types::F64 | types::I8X16 => true, + types::F32 | types::F64 | types::I8X16 | types::I16X8 | types::I32X4 | types::I64X2 => true, _ => false, } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 81b238adc48e..60e2480cb08b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1149,6 +1149,23 @@ impl MachInstEmit for Inst { | machreg_to_gpr(rd.to_reg()), ); } + &Inst::VecExtend { t, rd, rn } => { + let (u, immh) = match t { + VecExtendOp::Sxtl8 => (0b0, 0b001), + VecExtendOp::Sxtl16 => (0b0, 0b010), + VecExtendOp::Sxtl32 => (0b0, 0b100), + VecExtendOp::Uxtl8 => (0b1, 0b001), + VecExtendOp::Uxtl16 => (0b1, 0b010), + VecExtendOp::Uxtl32 => (0b1, 0b100), + }; + sink.put4( + 0b000_011110_0000_000_101001_00000_00000 + | (u << 29) + | (immh << 19) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } &Inst::VecRRR { rd, rn, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 6f302501d2c1..58985852857d 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1826,6 +1826,60 @@ fn test_aarch64_binemit() { "E5979F9A", "cset x5, hi", )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Sxtl8, + rd: writable_vreg(4), + rn: vreg(27), + }, + "64A7080F", + "sxtl v4.8h, v27.8b", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Sxtl16, + rd: writable_vreg(17), + rn: vreg(19), + }, + "71A6100F", + "sxtl v17.4s, v19.4h", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Sxtl32, + rd: writable_vreg(30), + rn: vreg(6), + }, + "DEA4200F", + "sxtl v30.2d, v6.2s", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Uxtl8, + rd: writable_vreg(3), + rn: vreg(29), + }, + "A3A7082F", + "uxtl v3.8h, v29.8b", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Uxtl16, + rd: writable_vreg(15), + rn: vreg(12), + }, + "8FA5102F", + "uxtl v15.4s, v12.4h", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Uxtl32, + rd: writable_vreg(28), + rn: vreg(2), + }, + "5CA4202F", + "uxtl v28.2d, v2.2s", + )); insns.push(( Inst::VecRRR { rd: writable_vreg(21), diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 6fb559dbb97b..f5b6ecde259b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -5,7 +5,8 @@ use crate::binemit::CodeOffset; use crate::ir::types::{ - B1, B16, B32, B64, B8, B8X16, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS, + B1, B16, B32, B64, B8, B8X16, F32, F32X2, F64, FFLAGS, I128, I16, I16X4, I16X8, I32, I32X2, + I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, }; use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type}; use crate::machinst::*; @@ -186,6 +187,23 @@ pub enum FpuRoundMode { Nearest64, } +/// Type of vector element extensions. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecExtendOp { + /// Signed extension of 8-bit elements + Sxtl8, + /// Signed extension of 16-bit elements + Sxtl16, + /// Signed extension of 32-bit elements + Sxtl32, + /// Unsigned extension of 8-bit elements + Uxtl8, + /// Unsigned extension of 16-bit elements + Uxtl16, + /// Unsigned extension of 32-bit elements + Uxtl32, +} + /// A vector ALU operation. #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum VecALUOp { @@ -667,6 +685,13 @@ pub enum Inst { rn: Reg, }, + /// Vector extend. + VecExtend { + t: VecExtendOp, + rd: Writable, + rn: Reg, + }, + /// A vector ALU op. VecRRR { alu_op: VecALUOp, @@ -1208,6 +1233,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(rd); collector.add_use(rn); } + &Inst::VecExtend { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } &Inst::VecRRR { rd, rn, rm, .. } => { collector.add_def(rd); collector.add_use(rn); @@ -1752,6 +1781,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_def(mapper, rd); map_use(mapper, rn); } + &mut Inst::VecExtend { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } &mut Inst::VecRRR { ref mut rd, ref mut rn, @@ -1940,7 +1977,7 @@ impl MachInst for Inst { I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64), F32 | F64 => Ok(RegClass::V128), IFLAGS | FFLAGS => Ok(RegClass::I64), - I8X16 => Ok(RegClass::V128), + I8X16 | I16X8 | I32X4 | I64X2 => Ok(RegClass::V128), B8X16 => Ok(RegClass::V128), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", @@ -2515,6 +2552,19 @@ impl ShowWithRRU for Inst { let rn = rn.show_rru(mb_rru); format!("mov {}, {}.d[0]", rd, rn) } + &Inst::VecExtend { t, rd, rn } => { + let (op, dest, src) = match t { + VecExtendOp::Sxtl8 => ("sxtl", I16X8, I8X8), + VecExtendOp::Sxtl16 => ("sxtl", I32X4, I16X4), + VecExtendOp::Sxtl32 => ("sxtl", I64X2, I32X2), + VecExtendOp::Uxtl8 => ("uxtl", I16X8, I8X8), + VecExtendOp::Uxtl16 => ("uxtl", I32X4, I16X4), + VecExtendOp::Uxtl32 => ("uxtl", I64X2, I32X2), + }; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest); + let rn = show_vreg_vector(rn, mb_rru, src); + format!("{} {}, {}", op, rd, rn) + } &Inst::VecRRR { rd, rn, diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index cebcf6ec307e..59841ed82c9c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -321,6 +321,12 @@ pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> match ty { I8X16 => s.push_str(".16b"), F32X2 => s.push_str(".2s"), + I8X8 => s.push_str(".8b"), + I16X4 => s.push_str(".4h"), + I16X8 => s.push_str(".8h"), + I32X2 => s.push_str(".2s"), + I32X4 => s.push_str(".4s"), + I64X2 => s.push_str(".2d"), _ => unimplemented!(), } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index b50c2a5edce4..2cf93296e908 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -716,7 +716,8 @@ pub fn ty_bits(ty: Type) -> usize { B64 | I64 | F64 => 64, B128 | I128 => 128, IFLAGS | FFLAGS => 32, - I8X16 | B8X16 => 128, + I8X8 | I16X4 | I32X2 => 64, + B8X16 | I8X16 | I16X8 | I32X4 | I64X2 => 128, _ => panic!("ty_bits() on unknown type: {:?}", ty), } } @@ -724,7 +725,7 @@ pub fn ty_bits(ty: Type) -> usize { pub(crate) fn ty_is_int(ty: Type) -> bool { match ty { B1 | B8 | I8 | B16 | I16 | B32 | I32 | B64 | I64 => true, - F32 | F64 | B128 | I128 | I8X16 => false, + F32 | F64 | B128 | I128 | I8X8 | I8X16 | I16X4 | I16X8 | I32X2 | I32X4 | I64X2 => false, IFLAGS | FFLAGS => panic!("Unexpected flags type"), _ => panic!("ty_is_int() on unknown type: {:?}", ty), } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 02f5feb9988b..a52b6fba0297 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -829,7 +829,13 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Uload16Complex | Opcode::Sload16Complex | Opcode::Uload32Complex - | Opcode::Sload32Complex => { + | Opcode::Sload32Complex + | Opcode::Sload8x8 + | Opcode::Uload8x8 + | Opcode::Sload16x4 + | Opcode::Uload16x4 + | Opcode::Sload32x2 + | Opcode::Uload32x2 => { let off = ldst_offset(ctx.data(insn)).unwrap(); let elem_ty = match op { Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { @@ -844,6 +850,9 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Sload32Complex | Opcode::Uload32Complex => I32, Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8, + Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4, + Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2, _ => unreachable!(), }; let sign_extend = match op { @@ -877,10 +886,30 @@ pub(crate) fn lower_insn_to_regs>( (32, true, false) => Inst::SLoad32 { rd, mem, srcloc }, (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc }, (64, _, false) => Inst::ULoad64 { rd, mem, srcloc }, + // Note that we treat some of the vector loads as scalar floating-point loads, + // which is correct in a little endian environment. (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc }, (128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc }, _ => panic!("Unsupported size in load"), }); + + let vec_extend = match op { + Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8), + Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8), + Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16), + Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16), + Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32), + Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32), + _ => None, + }; + + if let Some(t) = vec_extend { + ctx.emit(Inst::VecExtend { + t, + rd, + rn: rd.to_reg(), + }); + } } Opcode::Store @@ -1433,17 +1462,11 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Extractlane | Opcode::ScalarToVector | Opcode::Swizzle - | Opcode::Uload8x8 | Opcode::Uload8x8Complex - | Opcode::Sload8x8 | Opcode::Sload8x8Complex - | Opcode::Uload16x4 | Opcode::Uload16x4Complex - | Opcode::Sload16x4 | Opcode::Sload16x4Complex - | Opcode::Uload32x2 | Opcode::Uload32x2Complex - | Opcode::Sload32x2 | Opcode::Sload32x2Complex => { // TODO panic!("Vector ops not implemented.");