[Clang][SME2] Add multi-vector add/sub builtins #69725

Merged
kmclaughlin-arm merged 7 commits into llvm:main from sme2-builtins-add-sub on Nov 7, 2023

Conversation

kmclaughlin-arm (Contributor)

Adds the following SME2 builtins:

  • sv(add|sub)
  • sv(add|sub)_za32/za64
  • sv(add|sub)_write_za32/za64

Other changes in this patch:

  • CGBuiltin.cpp: Adds the GetAArch64SMEProcessedOperands function to
    avoid duplicating existing code from EmitAArch64SVEBuiltinExpr.
  • arm_sve.td: The add/sub SME2 builtins which do not operate on ZA have
    been added to arm_sve.td, since they match the corresponding LLVM IR
    intrinsics, whose names start with @llvm.aarch64.sve.
  • SveEmitter.cpp: Adds the createCoreHeaderIntrinsics function to remove
    duplicated code in createHeader & createSMEHeader. Uses a new enum
    (ACLEKind) to choose either "__builtin_sme_" or "__builtin_sve_" when
    emitting the intrinsics.

See https://github.com/ARM-software/acle/pull/217/files
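A minimal usage sketch of the new builtins, adapted from the tests added in this patch (the wrapper function names are illustrative, and streaming-mode attributes are omitted here, as they are in the tests):

  #include <arm_sme_draft_spec_subject_to_change.h>

  // Multi-vector add without ZA (from arm_sve.td): add one vector to
  // each part of a pair of vectors.
  svint32x2_t pairwise_add(svint32x2_t zdn, svint32_t zm) {
    return svadd_single_s32_x2(zdn, zm);
  }

  // Accumulating form (from arm_sme.td): add a pair of vectors into
  // ZA slices, vg1x2 grouping.
  void add_into_za(uint32_t slice_base, svint32x2_t zn) {
    svadd_za32_s32_vg1x2(slice_base, zn);
  }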

kmclaughlin-arm added the clang (Clang issues not falling into any other category) and clang:codegen labels on Oct 20, 2023
llvmbot added the clang:frontend (Language frontend issues, e.g. anything involving "Sema") label on Oct 20, 2023
llvmbot (Collaborator) commented Oct 20, 2023

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-clang-codegen

Author: Kerry McLaughlin (kmclaughlin-arm)


Patch is 405.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69725.diff

14 Files Affected:

  • (modified) clang/include/clang/Basic/arm_sme.td (+35)
  • (modified) clang/include/clang/Basic/arm_sve.td (+9)
  • (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1-1)
  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+68-62)
  • (modified) clang/lib/CodeGen/CodeGenFunction.h (+5)
  • (added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c (+1226)
  • (added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c (+418)
  • (modified) clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c (+18-18)
  • (modified) clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c (+178-178)
  • (modified) clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c (+22-22)
  • (modified) clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c (+218-218)
  • (modified) clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c (+26-26)
  • (modified) clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c (+258-258)
  • (modified) clang/utils/TableGen/SveEmitter.cpp (+43-62)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8d85327a86b1aaf..b5655afdf419ecf 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -263,3 +263,38 @@ multiclass ZAFPOuterProd<string n_suffix> {
 
 defm SVMOPA : ZAFPOuterProd<"mopa">;
 defm SVMOPS : ZAFPOuterProd<"mops">;
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - ADD, SUB
+
+multiclass ZAAddSub<string n_suffix> {
+  let TargetGuard = "sme2" in {
+    def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    let TargetGuard = "sme-i16i64" in {
+      def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+      def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+      def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+    }
+
+    let TargetGuard = "sme-f64f64" in {
+      def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+    }
+  }
+}
+
+defm SVADD : ZAAddSub<"add">;
+defm SVSUB : ZAAddSub<"sub">;
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 8f9bdd18829ff6b..a0b0ba4f2976052 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1893,3 +1893,12 @@ def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPm", "Pl", MergeNone, "", [], []>;
 
 def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone], [ImmCheck<1, ImmCheck2_4_Mul2>]>;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2
+
+let TargetGuard = "sme2" in {
+// == ADD (vectors) ==
+  def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
+  def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>;
+}
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index c3a6dc4e4d44abe..a484ed19efc7e0b 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -256,7 +256,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
 }
 
 class Inst<string n, string p, string t, MergeType mt, string i,
-           list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+           list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
   string Name = n;
   string Prototype = p;
   string Types = t;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2b341b8090fad7d..59a31505b8993fa 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9571,22 +9571,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
   Value *BasePtr = Ops[1];
 
   // Does the store have an offset?
-  if (Ops.size() > 3)
+  if (Ops.size() > (2 + N))
     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
 
-  Value *Val = Ops.back();
-
   // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
   // need to break up the tuple vector.
   SmallVector<llvm::Value*, 5> Operands;
-  unsigned MinElts = VTy->getElementCount().getKnownMinValue();
-  for (unsigned I = 0; I < N; ++I) {
-    Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
-    Operands.push_back(Builder.CreateExtractVector(VTy, Val, Idx));
-  }
+  for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
+    Operands.push_back(Ops[I]);
   Operands.append({Predicate, BasePtr});
-
   Function *F = CGM.getIntrinsic(IntID, { VTy });
+
   return Builder.CreateCall(F, Operands);
 }
 
@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-                                                  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(
+    unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
+    SVETypeFlags TypeFlags) {
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
-    Value *Val = EmitScalarExpr(E->getArg(0));
-    return EmitSVEReinterpret(Val, Ty);
-  }
+  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
 
-  llvm::SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if ((ICEArguments & (1 << i)) == 0)
+    if (!IsTupleGetOrSet && (ICEArguments & (1 << i)) == 0) {
+      Value *Arg = EmitScalarExpr(E->getArg(i));
+      if (auto *VTy = dyn_cast<ScalableVectorType>(Arg->getType())) {
+        unsigned MinElts = VTy->getMinNumElements();
+        bool IsPred = VTy->getElementType()->isIntegerTy(1);
+        unsigned N =
+            (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 16 : 128);
+        for (unsigned I = 0; I < N; ++I) {
+          Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N);
+          auto *NewVTy =
+              ScalableVectorType::get(VTy->getElementType(), MinElts / N);
+          if (N == 1 && VTy == NewVTy)
+            Ops.push_back(Arg);
+          else
+            Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx));
+        }
+      } else
+        Ops.push_back(Arg);
+    } else if ((ICEArguments & (1 << i)) == 0)
       Ops.push_back(EmitScalarExpr(E->getArg(i)));
     else {
       // If this is required to be a constant, constant fold it so that we know
@@ -9927,9 +9935,25 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
     }
   }
 
+  return;
+}
+
+Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
+                                                  const CallExpr *E) {
+  llvm::Type *Ty = ConvertType(E->getType());
+  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
+      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
+    Value *Val = EmitScalarExpr(E->getArg(0));
+    return EmitSVEReinterpret(Val, Ty);
+  }
+
   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
                                               AArch64SVEIntrinsicsProvenSorted);
+
+  llvm::SmallVector<Value *, 4> Ops;
   SVETypeFlags TypeFlags(Builtin->TypeModifier);
+  GetAArch64SMEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
+
   if (TypeFlags.isLoad())
     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
                              TypeFlags.isZExtReturn());
@@ -9943,10 +9967,10 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isGatherPrefetch())
     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
-	else if (TypeFlags.isStructLoad())
-		return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
-	else if (TypeFlags.isStructStore())
-		return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+  else if (TypeFlags.isStructLoad())
+    return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+  else if (TypeFlags.isStructStore())
+    return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
         return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops);
   else if (TypeFlags.isTupleCreate())
@@ -10202,13 +10226,8 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
   case SVE::BI__builtin_sve_svtbl2_f64: {
     SVETypeFlags TF(Builtin->TypeModifier);
     auto VTy = cast<llvm::ScalableVectorType>(getSVEType(TF));
-    Value *V0 = Builder.CreateExtractVector(VTy, Ops[0],
-                                            ConstantInt::get(CGM.Int64Ty, 0));
-    unsigned MinElts = VTy->getMinNumElements();
-    Value *V1 = Builder.CreateExtractVector(
-        VTy, Ops[0], ConstantInt::get(CGM.Int64Ty, MinElts));
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy);
-    return Builder.CreateCall(F, {V0, V1, Ops[1]});
+    return Builder.CreateCall(F, Ops);
   }
 
   case SVE::BI__builtin_sve_svset_neonq_s8:
@@ -10272,29 +10291,13 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  llvm::SmallVector<Value *, 4> Ops;
-  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if ((ICEArguments & (1 << i)) == 0)
-      Ops.push_back(EmitScalarExpr(E->getArg(i)));
-    else {
-      // If this is required to be a constant, constant fold it so that we know
-      // that the generated intrinsic gets a ConstantInt.
-      std::optional<llvm::APSInt> Result =
-          E->getArg(i)->getIntegerConstantExpr(getContext());
-      assert(Result && "Expected argument to be a constant");
-
-      // Immediates for SVE llvm intrinsics are always 32bit.  We can safely
-      // truncate because the immediate has been range checked and no valid
-      // immediate requires more than a handful of bits.
-      *Result = Result->extOrTrunc(32);
-      Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
-    }
-  }
-
   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
                                               AArch64SMEIntrinsicsProvenSorted);
+
+  llvm::SmallVector<Value *, 4> Ops;
   SVETypeFlags TypeFlags(Builtin->TypeModifier);
+  GetAArch64SMEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
+
   if (TypeFlags.isLoad() || TypeFlags.isStore())
     return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
@@ -10307,21 +10310,24 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
            BuiltinID == SME::BI__builtin_sme_svldr_za ||
            BuiltinID == SME::BI__builtin_sme_svstr_za)
     return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
-  else if (Builtin->LLVMIntrinsic != 0) {
-    // Predicates must match the main datatype.
-    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
-        if (PredTy->getElementType()->isIntegerTy(1))
-          Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
 
-    Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
-                                   getSVEOverloadTypes(TypeFlags, Ty, Ops));
-    Value *Call = Builder.CreateCall(F, Ops);
-    return Call;
-  }
+  // Should not happen!
+  if (Builtin->LLVMIntrinsic == 0)
+    return nullptr;
 
-  /// Should not happen
-  return nullptr;
+  // Predicates must match the main datatype.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
+      if (PredTy->getElementType()->isIntegerTy(1))
+        Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
+
+  Function *F =
+      TypeFlags.isOverloadNone()
+          ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
+          : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
+  Value *Call = Builder.CreateCall(F, Ops);
+
+  return FormSVEBuiltinResult(Call);
 }
 
 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index e82115e2d706cf1..ed50b051e77f96e 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4297,6 +4297,11 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// the wider vector. This avoids the error when allocating space in llvm
   /// for struct of scalable vectors if a function returns struct.
   llvm::Value *FormSVEBuiltinResult(llvm::Value *Call);
+
+  void GetAArch64SMEProcessedOperands(unsigned BuiltinID, const CallExpr *E,
+                                      SmallVectorImpl<llvm::Value *> &Ops,
+                                      SVETypeFlags TypeFlags);
+
   llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
 
   llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags,
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
new file mode 100644
index 000000000000000..18d7c66896d8b84
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -0,0 +1,1226 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svadd_write_single2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base + 7, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base + 7, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEX...
[truncated]
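To make the tuple-splitting arithmetic in GetAArch64SMEProcessedOperands concrete, here is one of the cases above worked through (a sketch derived from the diff, not text from the patch):

  // For an svint32x2_t argument, i.e. <vscale x 8 x i32>:
  //   MinElts = 8, element width = 32 bits, not a predicate, so
  //   N = (8 * 32) / 128 = 2 parts.
  //   Part I is extracted at index (I * MinElts) / N = I * 4, each as
  //   <vscale x 4 x i32> (MinElts / N = 4 elements).
  // This produces exactly the two @llvm.vector.extract calls at indices
  // 0 and 4 seen in the CHECK lines of acle_sme2_add.c above.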

github-actions bot commented Oct 20, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

david-arm (Contributor) left a comment

I've not done an exhaustive review, but thought I'd leave the comments I have so far!

for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
if ((ICEArguments & (1 << i)) == 0)
if (!IsTupleGetOrSet && (ICEArguments & (1 << i)) == 0) {
david-arm (Contributor):
Perhaps you can create a temp variable and reuse it so it's a bit clearer, i.e.

  bool IsICE = ICEArguments & (1 << i);
  if (!IsTupleGetOrSet && !IsICE) {
  ...
  } else if (!IsICE) {
  ...
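One way the completed suggestion could look (a sketch, not code from the patch):

  bool IsICE = ICEArguments & (1 << i);
  if (!IsTupleGetOrSet && !IsICE) {
    // Emit the argument and split any multi-vector tuple into its parts.
  } else if (!IsICE) {
    Ops.push_back(EmitScalarExpr(E->getArg(i)));
  } else {
    // Constant-fold the argument that is required to be an ICE.
  }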

@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
return Call;
}

Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
void CodeGenFunction::GetAArch64SMEProcessedOperands(
david-arm (Contributor):

I wonder if actually this is better named as GetAArch64SVEProcessedOperands because if we have to choose a name that's common to both the SME and SVE builtins, choosing SVE might make more sense. That's because we're specifically dealing with scalable vectors in general here and not something that's intrinsically linked to SME.

kmclaughlin-arm (Author):

I think this makes sense, I've changed it to GetAArch64SVEProcessedOperands

for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
if ((ICEArguments & (1 << i)) == 0)
david-arm (Contributor):

Might be worth adding a comment explaining why we explicitly ignore tuple get/set functions?

kmclaughlin-arm (Author):

Added the following comment:

// Tuple set/get only requires one insert/extract vector, which is
// created by EmitSVETupleSetOrGet.

@@ -10272,29 +10291,13 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
david-arm (Contributor):

Do we still need this code given we're now checking the ICE arguments in GetAArch64SMEProcessedOperands?

kmclaughlin-arm (Author):

I've removed this, it wasn't needed here now that it's checked in GetAArch64SVEProcessedOperands.

<< (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_")
<< FullName << ")";
if (!SMEAttrs.empty())
OS << SMEAttrs;
david-arm (Contributor):

It looks like we're no longer printing out the attributes for the builtin - is this because the attributes are dealt with explicitly elsewhere in clang and so they are no longer needed?

kmclaughlin-arm (Author):

The attributes aren't currently handled elsewhere in Clang so this shouldn't have been removed. I've added most of this code back in, but kept the ACLEKind switch in below as this is needed when the builtin kind is SVE but it has SME attributes.
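For context, the header emission being discussed produces declarations of roughly this shape (a sketch inferred from the quoted code; the exact attribute spelling and surrounding text come from SveEmitter.cpp):

  // arm_sme.h (sketch):
  //   __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_u32_vg1x2)))
  //   ...declaration..., followed by the SMEAttrs string when it is non-empty.
  // The new ACLEKind enum selects between the "__builtin_sme_" and
  // "__builtin_sve_" prefixes.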

kmclaughlin-arm (Author) left a comment
Thank you for reviewing this @david-arm!

david-arm (Contributor) left a comment

This looks a lot better now @kmclaughlin-arm - thanks for the changes! I just have a couple of comments about the tests that I missed previously...

//
void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) {
SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base + 6, zn);
}
david-arm (Contributor):

It looks like this file is missing many tests compared to the add file. For example, I think we're missing the signed variants when looking at the changes to arm_sme.td.

// CPP-CHECK-NEXT: ret void
//
void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) {
SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base + 6, zn);
david-arm (Contributor):

Perhaps it would be better to add 7 here instead, since that's the limit of the instruction? Having said that, since the LLVM intrinsics don't accept an immediate offset anyway maybe there isn't any value in adding immediate values for any of the add/sub tests?

david-arm (Contributor) left a comment

LGTM! Excellent! Thanks for the changes @kmclaughlin-arm.

////////////////////////////////////////////////////////////////////////////////
// SME2 - ADD, SUB

multiclass ZAAddSub<string n_suffix> {
sdesmalen-arm (Collaborator):

nit:

Suggested change
multiclass ZAAddSub<string n_suffix> {
multiclass AddSubZA<string n_suffix> {

kmclaughlin-arm (Author):

Hi @sdesmalen-arm, I'm happy to change this, though the reason I chose this name was to match other multiclasses in the file such as ZAStore, ZAAdd, ZAFPOuterProd, etc.

@@ -9571,22 +9571,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
Value *BasePtr = Ops[1];

// Does the store have an offset?
if (Ops.size() > 3)
if (Ops.size() > (2 + N))
Collaborator:

EmitSVEStructStore does not handle any new intrinsics here. Should this change be part of a different patch?

kmclaughlin-arm (Author):

This change was intended to be part of this patch. Now that EmitSVEStructLoad is being called after GetAArch64SVEProcessedOperands, the first operand will have been split into a series of vector extracts and the number of operands expected here will change depending on the value of N.
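The operand layouts behind that check, for illustration (inferred from the diff, not text from the patch):

  // Operands reaching EmitSVEStructStore once tuple arguments have been
  // pre-split by GetAArch64SVEProcessedOperands, e.g. for st2 (N = 2):
  //   no offset:   [pred, base, part0, part1]          -> size == 2 + N
  //   with offset: [pred, base, offset, part0, part1]  -> size  > 2 + N
  // The old check "Ops.size() > 3" assumed the tuple arrived as a single
  // value; it no longer holds once the tuple is passed as N separate parts.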

@@ -10266,35 +10288,13 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,

Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
// Find out if any arguments are required to be integer constant expressions.
Collaborator:

The changes to EmitAArch64SMEBuiltinExpr seem like a (significant) non-functional change. Can you separate it out as such so that it can be committed separately?

kmclaughlin-arm (Author):

I've moved these changes to #70662 and I'll update this patch shortly to remove the non-functional changes.

Adds the following SME2 builtins:
 - sv(add|sub)
 - sv(add|sub)_za32/za64
 - sv(add|sub)_write_za32/za64

Other changes in this patch:
 - SveEmitter.cpp: Adds the createCoreHeaderIntrinsics function to remove
   duplicated code in createHeader & createSMEHeader. Uses a new enum
   (ACLEKind) to choose either "__builtin_sme_" or "__builtin_sve_"
   when emitting the intrinsics.
 - CGBuiltin.cpp: The GetAArch64SMEProcessedOperands function is created
   to avoid duplicating existing code from EmitAArch64SVEBuiltinExpr.
 - arm_sve.td: The add/sub SME2 builtins which do not operate on ZA have
   been added to arm_sve.td, since they match the corresponding LLVM IR
   intrinsics, whose names start with @llvm.aarch64.sve.

See https://github.com/ARM-software/acle/pull/217/files
…ands

- Removed code checking for ICEArguments from EmitAArch64SMEBuiltinExpr
- Added comment over check for IsTupleGetOrSet
- Added the code to print SME attributes back into emitIntrinsic
- Added missing tests to acle_sme2_sub.c
- Ran clang-format to fix formatting issues in SveEmitter.cpp
- Added a comment to arm_sve.td to explain why some SME2 intrinsics are included
@@ -257,7 +257,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
}

class Inst<string n, string p, string t, MergeType mt, string i,
list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
dtemirbulatov (Contributor):

I am not sure about the purpose of this change ("met = MemEltTyDefault"). The last parameter in the Inst class usages in multiclass ZAAddSub is always []?

kmclaughlin-arm (Author):

Hi @dtemirbulatov, this change is just to set the default MemEltType to MemEltTyDefault, so that we don't have to provide this in every builtin definition.

@kmclaughlin-arm kmclaughlin-arm merged commit 48fb8ee into llvm:main Nov 7, 2023
3 checks passed
@kmclaughlin-arm kmclaughlin-arm deleted the sme2-builtins-add-sub branch November 8, 2023 10:32