diff --git a/build_system/tests.rs b/build_system/tests.rs index 1e24d1b113fe6..10736ff9a55c8 100644 --- a/build_system/tests.rs +++ b/build_system/tests.rs @@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[ TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]), TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]), TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"), + TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]), ]; pub(crate) static RAND_REPO: GitRepo = GitRepo::github( diff --git a/config.txt b/config.txt index 7ff805e58d967..2ccdc7d78748a 100644 --- a/config.txt +++ b/config.txt @@ -42,6 +42,7 @@ aot.float-minmax-pass aot.mod_bench aot.issue-72793 aot.issue-59326 +aot.neon testsuite.extended_sysroot test.rust-random/rand diff --git a/example/neon.rs b/example/neon.rs new file mode 100644 index 0000000000000..bad26947967dc --- /dev/null +++ b/example/neon.rs @@ -0,0 +1,234 @@ +// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs + +#![feature(portable_simd)] + +#[cfg(target_arch = "aarch64")] +use std::arch::aarch64::*; +use std::mem::transmute; +use std::simd::*; + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]); + let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([1, -4, 0, 2]); + let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([-2, 0]); + let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]); + let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([1, 3, 0, 2]); + let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([1, 0]); + let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([-2., 0.]); + let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]); + let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = 
i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([2, 3, 3, 5]); + let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([1, 3]); + let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]); + let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([2, 4, 3, 5]); + let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([2, 3]); + let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([1., 3.]); + let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s16() { + let a = i16x4::from([1, 2, 3, 4]); + let b = i16x4::from([0, -1, -2, -3]); + let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b))); + let e = i16x4::from([3, 7, -1, -5]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s32() { + let a = i32x2::from([1, 2]); + let b = i32x2::from([0, -1]); + let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b))); + let e = i32x2::from([3, -1]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s8() { + let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]); + let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b))); + let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([30, 31, 32, 33]); + let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b))); + let e = u16x4::from([3, 7, 61, 65]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([30, 31]); + let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b))); + let e = u32x2::from([3, 61]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]); + let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqsub_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b))); + let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = 
u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +fn main() { + unsafe { + test_vpmin_s8(); + test_vpmin_s16(); + test_vpmin_s32(); + test_vpmin_u8(); + test_vpmin_u16(); + test_vpmin_u32(); + test_vpmin_f32(); + test_vpmax_s8(); + test_vpmax_s16(); + test_vpmax_s32(); + test_vpmax_u8(); + test_vpmax_u16(); + test_vpmax_u32(); + test_vpmax_f32(); + + test_vpadd_s16(); + test_vpadd_s32(); + test_vpadd_s8(); + test_vpadd_u16(); + test_vpadd_u32(); + test_vpadd_u8(); + + test_vqsub_u8(); + test_vqadd_u8(); + } +} + +#[cfg(not(target_arch = "aarch64"))] +fn main() {} diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs index 0c211a06dc4a7..ee098be1fce6b 100644 --- a/src/intrinsics/llvm_aarch64.rs +++ b/src/intrinsics/llvm_aarch64.rs @@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + 
simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane), + ); + } + // FIXME generalize vector types "llvm.aarch64.neon.tbl1.v16i8" => { intrinsic_args!(fx, args => (t, idx); intrinsic); @@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( } } - // FIXME generalize vector types - "llvm.aarch64.neon.umaxp.v16i8" => { - intrinsic_args!(fx, args => (a, b); intrinsic); - - // FIXME add helper for horizontal pairwise operations - for i in 0..8 { - let lane1 = a.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); - } - for i in 0..8 { - let lane1 = b.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted()); - } - } - /* _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v") || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v") diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 58b31a534328e..bfeeb117ff5b3 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -132,6 +132,36 @@ fn simd_pair_for_each_lane<'tcx>( } } +fn simd_horizontal_pair_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let src = if lane_idx < (lane_count / 2) { x } else { y }; + let src_idx = lane_idx % (lane_count / 2); + + let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx); + let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + fn simd_trio_for_each_lane<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, x: CValue<'tcx>,
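
Note (not part of the patch): a minimal standalone sketch of the lane mapping that the new simd_horizontal_pair_for_each_lane helper implements, using plain arrays in place of Cranelift lanes; horizontal_pair is a hypothetical name. The first half of the result pairwise-reduces x and the second half pairwise-reduces y, which is the behaviour the AArch64 addp/sminp/smaxp/uminp/umaxp/fminp/fmaxp family expects. The assertions reuse the test_vpadd_s16 and test_vpmax_s16 vectors from example/neon.rs above.

// Array-based sketch (hypothetical helper name); mirrors the loop in
// simd_horizontal_pair_for_each_lane from src/intrinsics/mod.rs above.
fn horizontal_pair<const N: usize>(
    x: [i16; N],
    y: [i16; N],
    op: impl Fn(i16, i16) -> i16,
) -> [i16; N] {
    let mut out = [0i16; N];
    let half = N / 2;
    for lane_idx in 0..N {
        // Result lanes 0..half reduce adjacent pairs of x; the rest reduce y.
        let src = if lane_idx < half { &x } else { &y };
        let src_idx = lane_idx % half;
        out[lane_idx] = op(src[src_idx * 2], src[src_idx * 2 + 1]);
    }
    out
}

fn main() {
    // Same vectors as test_vpadd_s16 and test_vpmax_s16 in example/neon.rs.
    assert_eq!(horizontal_pair([1i16, 2, 3, 4], [0, -1, -2, -3], |l, r| l + r), [3, 7, -1, -5]);
    assert_eq!(horizontal_pair([1i16, 2, 3, -4], [0, 3, 2, 5], |l, r| l.max(r)), [2, 3, 3, 5]);
}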
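
Also not part of the patch: the uqadd/uqsub patterns are routed through the same match arms as sqadd/sqsub, so presumably the shared per-lane lowering picks signed versus unsigned saturation from the lane type (the callee is outside the hunk context shown). The expected values in test_vqadd_u8 / test_vqsub_u8 are plain scalar unsigned saturation, for example:

fn main() {
    // Matches lanes 5 and 7 of the vqadd_u8 expectation ([.., 0xff, .., 0xff])
    // and lanes 4 and 7 of the vqsub_u8 expectation ([.., 0, .., 218]).
    assert_eq!(6u8.saturating_add(0xff), 0xff);
    assert_eq!(0xffu8.saturating_add(37), 0xff);
    assert_eq!(5u8.saturating_sub(34), 0);
    assert_eq!(0xffu8.saturating_sub(37), 218);
}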