Merged main:5abbf20f0fe5 into amd-gfx:2f1f2c28c29e

Local branch amd-gfx 2f1f2c2 Merged main:09f717b929ae into amd-gfx:4708702c57fa Remote branch main 5abbf20 [ARM] Additional test for Min loop. NFC
jaebaek · Dec 10, 2020 · 3083a61 · 3083a61
2 parents 2f1f2c2 + 5abbf20
commit 3083a61
Show file tree

Hide file tree

Showing 58 changed files with 2,728 additions and 149 deletions.
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -100,6 +100,10 @@ TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
 TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
 TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
 
+// AMX internal builtin
+TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
@@ -5142,8 +5142,11 @@ static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info,
   case Stmt::ReturnStmtClass: {
     const Expr *RetExpr = cast<ReturnStmt>(S)->getRetValue();
     FullExpressionRAII Scope(Info);
-    if (RetExpr && RetExpr->isValueDependent())
-      return EvaluateDependentExpr(RetExpr, Info) ? ESR_Returned : ESR_Failed;
+    if (RetExpr && RetExpr->isValueDependent()) {
+      EvaluateDependentExpr(RetExpr, Info);
+      // We know we returned, but we don't know what the value is.
+      return ESR_Failed;
+    }
     if (RetExpr &&
         !(Result.Slot
               ? EvaluateInPlace(Result.Value, Info, *Result.Slot, RetExpr)

diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
@@ -15,8 +15,8 @@
 #define __AMXINTRIN_H
 #ifdef __x86_64__
 
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("amx-tile")))
+#define __DEFAULT_FN_ATTRS_TILE                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
 
 /// Load tile configuration from a 64-byte memory location specified by
 /// "mem_addr". The tile configuration includes the tile type palette, the
@@ -31,9 +31,8 @@
 ///
 /// \param __config
 ///    A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_loadconfig(const void *__config)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_loadconfig(const void *__config) {
   __builtin_ia32_tile_loadconfig(__config);
 }
 
@@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
 ///
 /// \param __config
 ///    A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_storeconfig(void *__config)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_storeconfig(void *__config) {
   __builtin_ia32_tile_storeconfig(__config);
 }
 
@@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_release(void)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
   __builtin_ia32_tilerelease();
 }
 
@@ -80,8 +76,9 @@ _tile_release(void)
 ///    A pointer to base address.
 /// \param stride
 ///    The stride between the rows' data to be loaded in memory.
-#define _tile_loadd(dst, base, stride) \
-  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+#define _tile_loadd(dst, base, stride)                                         \
+  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
+                             (__SIZE_TYPE__)(stride))
 
 /// Load tile rows from memory specifieid by "base" address and "stride" into
 /// destination tile "dst" using the tile configuration previously configured
@@ -99,8 +96,9 @@ _tile_release(void)
 ///    A pointer to base address.
 /// \param stride
 ///    The stride between the rows' data to be loaded in memory.
-#define _tile_stream_loadd(dst, base, stride) \
-  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+#define _tile_stream_loadd(dst, base, stride)                                  \
+  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
+                               (__SIZE_TYPE__)(stride))
 
 /// Store the tile specified by "src" to memory specifieid by "base" address and
 /// "stride" using the tile configuration previously configured via
@@ -116,7 +114,7 @@ _tile_release(void)
 ///    A pointer to base address.
 /// \param stride
 ///    The stride between the rows' data to be stored in memory.
-#define _tile_stored(dst, base, stride) \
+#define _tile_stored(dst, base, stride)                                        \
   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
 
 /// Zero the tile specified by "tdest".
@@ -145,7 +143,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
+#define _tile_dpbssd(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbssd((dst), (src0), (src1))
 
 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
@@ -163,7 +162,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
+#define _tile_dpbsud(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbsud((dst), (src0), (src1))
 
 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@@ -181,7 +181,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
+#define _tile_dpbusd(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbusd((dst), (src0), (src1))
 
 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@@ -199,7 +200,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
+#define _tile_dpbuud(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbuud((dst), (src0), (src1))
 
 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
@@ -216,10 +218,56 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf16ps(dst, src0, src1) \
+#define _tile_dpbf16ps(dst, src0, src1)                                        \
   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
 
-#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS_INT8                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
+
+typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
+                     __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloadd64_internal(m, n, base,
+                                             (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_INT8
+_tile_stored_internal(unsigned short m, unsigned short n, void *base,
+                      __SIZE_TYPE__ stride, _tile1024i tile) {
+  return __builtin_ia32_tilestored64_internal(m, n, base,
+                                              (__SIZE_TYPE__)(stride), tile);
+}
+
+typedef struct __tile1024i_str {
+  const unsigned short row;
+  const unsigned short col;
+  _tile1024i tile;
+} __tile1024i;
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_loadd(__tile1024i *dst, const void *base,
+                         __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
+}
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
+                          __tile1024i src2) {
+  dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
+                                    src1.tile, src2.tile);
+}
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
+  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
+}
 
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown  -target-feature +avx512f  -target-feature +amx-int8  \
+// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+
+char buf[1024];
+#define STRIDE 32
+
+char buf2[1024];
+
+// This is an example code and integration test.
+void test_api(int cond, short row, short col) {
+  //CHECK-LABEL: @test_api
+  //CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
+  //CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
+  //CHECK: call void @llvm.x86.tilestored64.internal
+  __tile1024i a = {row, 8};
+  __tile1024i b = {8, col};
+  __tile1024i c = {row, col};
+
+  if (cond) {
+    __tile_loadd(&a, buf, STRIDE);
+    __tile_loadd(&b, buf, STRIDE);
+    __tile_loadd(&c, buf, STRIDE);
+  } else {
+    __tile_loadd(&a, buf2, STRIDE);
+    __tile_loadd(&b, buf2, STRIDE);
+    __tile_loadd(&c, buf2, STRIDE);
+  }
+  __tile_dpbsud(&c, a, b);
+  __tile_stored(buf, STRIDE, c);
+}
+
+void test_tile_loadd(short row, short col) {
+  //CHECK-LABEL: @test_tile_loadd
+  //CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
+  __tile1024i a = {row, col};
+  __tile_loadd(&a, buf, STRIDE);
+}
+
+void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbsud
+  //CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
+  __tile_dpbsud(&c, a, b);
+}
+
+void test_tile_stored(__tile1024i c) {
+  //CHECK-LABEL: @test_tile_stored
+  //CHECK: call void @llvm.x86.tilestored64.internal
+  __tile_stored(buf, STRIDE, c);
+}
diff --git a/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp b/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp
@@ -66,3 +66,6 @@ template<int x> constexpr int f(int y) { // expected-note {{candidate template i
 constexpr int test9(int x) {
   return f<1>(f<x>(1)); // expected-error {{no matching function for call to 'f'}}
 }
+
+constexpr int test10() { return undef(); } // expected-error {{use of undeclared identifier 'undef'}}
+static_assert(test10() <= 1, "should not crash"); // expected-error {{static_assert expression is not an integral constant expression}}
diff --git a/lldb/test/API/lang/cpp/multiple-inheritance/TestCppMultipleInheritance.py b/lldb/test/API/lang/cpp/multiple-inheritance/TestCppMultipleInheritance.py
@@ -9,7 +9,7 @@ class TestCase(TestBase):
 
     def test(self):
         self.build()
-        lldbutil.run_to_source_breakpoint(self,"// break here", lldb.SBFileSpec("main.cpp"))
+        lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp"))
 
         # Member access
         self.expect_expr("C.Base1::m_base", result_type="int", result_value="11")

diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -104,6 +104,9 @@ class LiveIntervalUnion {
   void verify(LiveVirtRegBitSet& VisitedVRegs);
 #endif
 
+  // Get any virtual register that is assign to this physical unit
+  LiveInterval *getOneVReg() const;
+
   /// Query interferences between a single live virtual register and a live
   /// interval union.
   class Query {

diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -153,6 +153,8 @@ class LiveRegMatrix : public MachineFunctionPass {
   /// Directly access the live interval unions per regunit.
   /// This returns an array indexed by the regunit number.
   LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; }
+
+  Register getOneVReg(unsigned PhysReg) const;
 };
 
 } // end namespace llvm

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
@@ -485,6 +485,10 @@ namespace llvm {
   /// The pass fixups statepoint machine instruction to replace usage of
   /// caller saved registers with stack slots.
   extern char &FixupStatepointCallerSavedID;
+
+  /// The pass transform load/store <256 x i32> to AMX load/store intrinsics
+  /// or split the data to two <128 x i32>.
+  FunctionPass *createX86LowerAMXTypePass();
 } // End llvm namespace
 
 #endif
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -0,0 +1,97 @@
+//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Shape utility for AMX.
+/// AMX hardware requires to config the shape of tile data register before use.
+/// The 2D shape includes row and column. In AMX intrinsics interface the shape
+/// is passed as 1st and 2nd parameter and they are lowered as the 1st and 2nd
+/// machine operand of AMX pseudo instructions. ShapeT class is to facilitate
+/// tile config and register allocator. The row and column are machine operand
+/// of AMX pseudo instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_TILESHAPEINFO_H
+#define LLVM_CODEGEN_TILESHAPEINFO_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include <utility>
+
+namespace llvm {
+
+class ShapeT {
+public:
+  ShapeT(MachineOperand *Row, MachineOperand *Col,
+         const MachineRegisterInfo *MRI = nullptr)
+      : Row(Row), Col(Col) {
+    if (MRI)
+      deduceImm(MRI);
+  }
+  ShapeT()
+      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
+        ColImm(InvalidImmShape) {}
+  bool operator==(const ShapeT &Shape) {
+    MachineOperand *R = Shape.Row;
+    MachineOperand *C = Shape.Col;
+    if (!R || !C)
+      return false;
+    if (!Row || !Col)
+      return false;
+    if (Row->getReg() == R->getReg() && Col->getReg() == C->getReg())
+      return true;
+    if ((RowImm != InvalidImmShape) && (ColImm != InvalidImmShape))
+      return RowImm == Shape.getRowImm() && ColImm == Shape.getColImm();
+    return false;
+  }
+
+  bool operator!=(const ShapeT &Shape) { return !(*this == Shape); }
+
+  MachineOperand *getRow() const { return Row; }
+
+  MachineOperand *getCol() const { return Col; }
+
+  int64_t getRowImm() const { return RowImm; }
+
+  int64_t getColImm() const { return ColImm; }
+
+  bool isValid() { return (Row != nullptr) && (Col != nullptr); }
+
+  void deduceImm(const MachineRegisterInfo *MRI) {
+    // All def must be the same value, otherwise it is invalid MIs.
+    // Find the immediate.
+    // TODO copy propagation.
+    auto GetImm = [&](Register Reg) {
+      int64_t Imm = InvalidImmShape;
+      for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
+        const auto *MI = DefMO.getParent();
+        if (MI->isMoveImmediate()) {
+          Imm = MI->getOperand(1).getImm();
+          break;
+        }
+      }
+      return Imm;
+    };
+    RowImm = GetImm(Row->getReg());
+    ColImm = GetImm(Col->getReg());
+  }
+
+private:
+  static constexpr int64_t InvalidImmShape = -1;
+  MachineOperand *Row;
+  MachineOperand *Col;
+  int64_t RowImm;
+  int64_t ColImm;
+};
+
+} // namespace llvm
+
+#endif