diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8d85327a86b1aa..b5655afdf419ec 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -263,3 +263,38 @@ multiclass ZAFPOuterProd<string n_suffix> {
 
 defm SVMOPA : ZAFPOuterProd<"mopa">;
 defm SVMOPS : ZAFPOuterProd<"mops">;
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - ADD, SUB
+
+multiclass ZAAddSub<string n_suffix> {
+  let TargetGuard = "sme2" in {
+    def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    let TargetGuard = "sme-i16i64" in {
+      def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+      def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+      def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+    }
+
+    let TargetGuard = "sme-f64f64" in {
+      def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+    }
+  }
+}
+
+defm SVADD : ZAAddSub<"add">;
+defm SVSUB : ZAAddSub<"sub">;
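Reviewer note: the multiclass above generates the `sv{add,sub}_write[_single]_za{32,64}` and `sv{add,sub}_za{32,64}` ACLE intrinsics exercised by the new tests further down. A minimal usage sketch, assuming the SME ACLE header used by the tests in this patch (the kernel and its name are hypothetical, not part of the change):

```c
#include <arm_sme_draft_spec_subject_to_change.h>  // spelled <arm_sme.h> in current releases

// Hypothetical sketch: accumulate a pair of f32 vectors onto two ZA slices
// using the svadd_za32[_f32]_vg1x2 intrinsic generated by ZAAddSub above.
void accumulate_rows(uint32_t slice, svfloat32x2_t zn)
    __arm_streaming __arm_shared_za {
  svadd_za32_f32_vg1x2(slice, zn);  // prototype "vm2": (slice, vector tuple) -> void
}
```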
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b5baafedd13960..3d4c2129565903 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1980,3 +1980,15 @@ def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sv
 
 defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2
+
+// SME intrinsics which operate only on vectors and do not require ZA should be added here,
+// as they could possibly become SVE instructions in the future.
+
+let TargetGuard = "sme2" in {
+// == ADD (vectors) ==
+  def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
+  def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>;
+}
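Reviewer note: the two `SInst` records above are the vector-only multi-vector adds; per the prototype `"22d"`/`"44d"` they take a vector tuple plus one single vector and return a tuple, and they need only streaming mode, not ZA. A sketch of the resulting C interface (the wrapper function is hypothetical):

```c
#include <arm_sme_draft_spec_subject_to_change.h>  // spelled <arm_sme.h> in current releases

// Hypothetical sketch: add the same single vector to both halves of an
// svint32x2_t tuple via the svadd[_single_s32_x2] form defined above.
svint32x2_t add_bias(svint32x2_t zdn, svint32_t zm) __arm_streaming {
  return svadd_single_s32_x2(zdn, zm);  // "22d": (tuple, vector) -> tuple
}
```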
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 3a7a5b51b25801..22a2a3c5434d65 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -257,7 +257,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
 }
 
 class Inst<string n, string p, string t, MergeType mt, string i,
-           list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+           list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
   string Name = n;
   string Prototype = p;
   string Types = t;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ee2c4b1e10afba..42f94c9b540191 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4297,6 +4297,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// the wider vector. This avoids the error when allocating space in llvm
   /// for struct of scalable vectors if a function returns struct.
   llvm::Value *FormSVEBuiltinResult(llvm::Value *Call);
+
   llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
 
   llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags,
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
new file mode 100644
index 00000000000000..dd96dca70d6370
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -0,0 +1,649 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
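Reviewer note: the macro keeps each test body valid under both naming schemes. It pastes all five tokens for the full intrinsic names, or only the 1st/3rd/5th when `-DSVE_OVERLOADED_FORMS` selects the overloaded names. A worked expansion:

```c
// Expansion sketch for SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2):
//
//   default build:           svadd_write_single_za32_s32_vg1x2
//   -DSVE_OVERLOADED_FORMS:  svadd_write_za32_vg1x2   (overloaded form)
//
// so the same call site exercises both spellings of each builtin.
```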
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svadd_write_single2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s64j11svint64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u64j12svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svadd_write_single4_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s32j11svint32x4_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u32j12svuint32x4_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single4_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s64j11svint64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single4_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u64j12svuint64x4_tu12__SVUint64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi-Multi
+//
+
+// x2
+
+// CHECK-LABEL: @test_svadd_write_multi2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s32j11svint32x2_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u32j12svuint32x2_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s64j11svint64x2_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi2_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u64j12svuint64x2_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svadd_write_multi4_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s32j11svint32x4_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u32j12svuint32x4_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi4_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s64j11svint64x4_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi4_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u64j12svuint64x4_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Accumulate to ZA
+//
+
+// x2
+
+// CHECK-LABEL: @test_svadd_za32_vg1x2_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_f32j13svfloat32x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_s32j11svint32x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_u32j12svuint32x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x2_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_f64j13svfloat64x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_s64j11svint64x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x2_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_u64j12svuint64x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, zn);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svadd_za32_vg1x4_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_f32j13svfloat32x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x4_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_s32j11svint32x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_u32j12svuint32x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x4_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_f64j13svfloat64x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x4_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_s64j11svint64x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x4_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_u64j12svuint64x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, zn);
+}
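Reviewer note: the subtraction tests below mirror the addition tests one-for-one; only the intrinsic stem changes. The two ZA-targeting shapes differ in how they update ZA: the `_write` forms store a freshly computed sum or difference into the selected slices, while the plain `sv{add,sub}_za{32,64}` forms accumulate a vector group onto the existing ZA contents. A hedged sketch of the contrast (hypothetical helper; ZA updates shown as pseudocode comments implied by the intrinsic names):

```c
#include <arm_sme_draft_spec_subject_to_change.h>  // spelled <arm_sme.h> in current releases

// Hypothetical sketch contrasting the "write" and "accumulate" forms.
void za_update(uint32_t slice, svint32x2_t zn, svint32x2_t zm)
    __arm_streaming __arm_shared_za {
  svadd_write_za32_s32_vg1x2(slice, zn, zm);  // ZA[slice, vgx2] = zn + zm
  svsub_za32_s32_vg1x2(slice, zn);            // ZA[slice, vgx2] -= zn
}
```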
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
new file mode 100644
index 00000000000000..9570deab0b3919
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
@@ -0,0 +1,649 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svsub_write_single2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s64j11svint64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svsub_write_single4_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s32j11svint32x4_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u32j12svuint32x4_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single4_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s64j11svint64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +// CHECK-LABEL: @test_svsub_write_multi2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s32j11svint32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u32j12svuint32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s64j11svint64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u64j12svuint64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); +} + +// x4 + +// CHECK-LABEL: @test_svsub_write_multi4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s32j11svint32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u32j12svuint32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s64j11svint64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u64j12svuint64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +// CHECK-LABEL: @test_svsub_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 
4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_s32j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_s64j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, zn); +} + +// x4 + +// CHECK-LABEL: @test_svsub_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_s32j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za { + 
SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_s64j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c new file mode 100644 index 00000000000000..85c4b9b09546b3 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c @@ -0,0 +1,539 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi-Single Vector +// + +// x2 + +// CHECK-LABEL: @test_svadd_vector_single2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } 
@llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: 
[[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue 
{ , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm); +} + + +// x4 + +// CHECK-LABEL: @test_svadd_vector_single4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// 
CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 
1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
+//
+svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) __arm_streaming {
+  return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
+//
+svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) __arm_streaming {
+  return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
+//
+svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) __arm_streaming {
+  return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
+//
+svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) __arm_streaming {
+  return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm);
+}
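For reference, a minimal usage sketch (not part of the patch) of the intrinsics exercised by the tests above. The function name and compile flags here are illustrative; it assumes a compiler built with this patch and the sme2 target feature enabled, with the non-overloaded intrinsic names emitted into arm_sve.h as arranged by the TableGen changes below.

// usage_sketch.c -- illustrative only; assumes e.g. -target-feature +sme2
#include <arm_sve.h>

// Adds the single vector zm to each of the four vectors in the zn tuple,
// i.e. the svadd[_single_u32_x4] form tested above. Must run in streaming mode.
svuint32x4_t add_single_to_tuple(svuint32x4_t zn, svuint32_t zm) __arm_streaming {
  return svadd_single_u32_x4(zn, zm);
}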
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 368908e79bf196..96253c70932773 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -43,6 +43,8 @@ enum ClassKind {
   ClassG,     // Overloaded name without type suffix
 };
 
+enum class ACLEKind { SVE, SME };
+
 using TypeSpec = std::string;
 
 namespace {
@@ -246,7 +248,7 @@ class Intrinsic {
   }
 
   /// Emits the intrinsic declaration to the ostream.
-  void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const;
+  void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, ACLEKind Kind) const;
 
 private:
   std::string getMergeSuffix() const { return MergeSuffix; }
@@ -354,6 +356,10 @@ class SVEEmitter {
   /// Emit arm_sve.h.
   void createHeader(raw_ostream &o);
 
+  // Emits core intrinsics in both arm_sme.h and arm_sve.h
+  void createCoreHeaderIntrinsics(raw_ostream &o, SVEEmitter &Emitter,
+                                  ACLEKind Kind);
+
   /// Emit all the __builtin prototypes and code needed by Sema.
   void createBuiltins(raw_ostream &o);
 
@@ -1023,7 +1029,8 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) const {
          getMergeSuffix();
 }
 
-void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const {
+void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter,
+                              ACLEKind Kind) const {
   bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1;
 
   std::string FullName = mangleName(ClassS);
@@ -1040,9 +1047,17 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const {
     SMEAttrs += ", arm_preserves_za";
 
   OS << (IsOverloaded ? "__aio " : "__ai ")
-     << "__attribute__((__clang_arm_builtin_alias("
-     << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_")
-     << FullName << ")";
+     << "__attribute__((__clang_arm_builtin_alias(";
+
+  switch (Kind) {
+  case ACLEKind::SME:
+    OS << "__builtin_sme_" << FullName << ")";
+    break;
+  case ACLEKind::SVE:
+    OS << "__builtin_sve_" << FullName << ")";
+    break;
+  }
+
   if (!SMEAttrs.empty())
     OS << SMEAttrs;
   OS << "))\n";
@@ -1180,6 +1195,34 @@ void SVEEmitter::createIntrinsic(
   }
 }
 
+void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS,
+                                            SVEEmitter &Emitter,
+                                            ACLEKind Kind) {
+  SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
+  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
+  // Sort intrinsics in header file by following order/priority:
+  //  - Architectural guard (i.e. does it require SVE2 or SVE2_AES)
+  //  - Class (is intrinsic overloaded or not)
+  //  - Intrinsic name
+  std::stable_sort(Defs.begin(), Defs.end(),
+                   [](const std::unique_ptr<Intrinsic> &A,
+                      const std::unique_ptr<Intrinsic> &B) {
+                     auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
+                       return std::make_tuple(I->getGuard(),
+                                              (unsigned)I->getClassKind(),
+                                              I->getName());
+                     };
+                     return ToTuple(A) < ToTuple(B);
+                   });
+
+  // Actually emit the intrinsic declarations.
+  for (auto &I : Defs)
+    I->emitIntrinsic(OS, Emitter, Kind);
+}
+
 void SVEEmitter::createHeader(raw_ostream &OS) {
   OS << "/*===---- arm_sve.h - ARM SVE intrinsics "
         "-----------------------------------===\n"
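To illustrate the effect of the ACLEKind switch above: the same Inst record now aliases a builtin with the prefix matching the header being emitted, rather than inferring the prefix from the presence of SME attributes. A hedged sketch of the emitted declarations, using intrinsic names from this patch's .td additions as examples (not verbatim emitter output; exact attribute spelling follows emitIntrinsic):

/* arm_sve.h (ACLEKind::SVE):
     __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u32_x2)))
   arm_sme.h (ACLEKind::SME), with any collected SME attributes appended
   after the alias, e.g. ", arm_streaming, arm_shared_za":
     __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_u32_vg1x2), arm_streaming, arm_shared_za)) */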
@@ -1331,27 +1374,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
        << To.Suffix << "(__VA_ARGS__)\n";
   }
 
-  SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
-  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
-  for (auto *R : RV)
-    createIntrinsic(R, Defs);
-
-  // Sort intrinsics in header file by following order/priority:
-  //  - Architectural guard (i.e. does it require SVE2 or SVE2_AES)
-  //  - Class (is intrinsic overloaded or not)
-  //  - Intrinsic name
-  std::stable_sort(
-      Defs.begin(), Defs.end(), [](const std::unique_ptr<Intrinsic> &A,
-                                   const std::unique_ptr<Intrinsic> &B) {
-        auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
-          return std::make_tuple(I->getGuard(), (unsigned)I->getClassKind(), I->getName());
-        };
-        return ToTuple(A) < ToTuple(B);
-      });
-
-  // Actually emit the intrinsic declarations.
-  for (auto &I : Defs)
-    I->emitIntrinsic(OS, *this);
+  createCoreHeaderIntrinsics(OS, *this, ACLEKind::SVE);
 
   OS << "#define svcvtnt_bf16_x      svcvtnt_bf16_m\n";
   OS << "#define svcvtnt_bf16_f32_x  svcvtnt_bf16_f32_m\n";
@@ -1533,30 +1556,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
   OS << "extern \"C\" {\n";
   OS << "#endif\n\n";
 
-  SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
-  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
-  for (auto *R : RV)
-    createIntrinsic(R, Defs);
-
-  // Sort intrinsics in header file by following order/priority similar to SVE:
-  //  - Architectural guard
-  //  - Class (is intrinsic overloaded or not)
-  //  - Intrinsic name
-  std::stable_sort(Defs.begin(), Defs.end(),
-                   [](const std::unique_ptr<Intrinsic> &A,
-                      const std::unique_ptr<Intrinsic> &B) {
-                     auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
-                       return std::make_tuple(I->getGuard(),
-                                              (unsigned)I->getClassKind(),
-                                              I->getName());
-                     };
-                     return ToTuple(A) < ToTuple(B);
-                   });
-
-  // Actually emit the intrinsic declaration.
-  for (auto &I : Defs) {
-    I->emitIntrinsic(OS, *this);
-  }
+  createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME);
 
   OS << "#ifdef __cplusplus\n";
   OS << "} // extern \"C\"\n";