Skip to content

Commit

Permalink
Merged main:5abbf20f0fe5 into amd-gfx:2f1f2c28c29e
Browse files Browse the repository at this point in the history
Local branch amd-gfx 2f1f2c2 Merged main:09f717b929ae into amd-gfx:4708702c57fa
Remote branch main 5abbf20 [ARM] Additional test for Min loop. NFC
  • Loading branch information
Sw authored and Sw committed Dec 10, 2020
2 parents 2f1f2c2 + 5abbf20 commit 3083a61
Show file tree
Hide file tree
Showing 58 changed files with 2,728 additions and 149 deletions.
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86_64.def
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")

// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
Expand Down
7 changes: 5 additions & 2 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5142,8 +5142,11 @@ static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info,
case Stmt::ReturnStmtClass: {
const Expr *RetExpr = cast<ReturnStmt>(S)->getRetValue();
FullExpressionRAII Scope(Info);
if (RetExpr && RetExpr->isValueDependent())
return EvaluateDependentExpr(RetExpr, Info) ? ESR_Returned : ESR_Failed;
if (RetExpr && RetExpr->isValueDependent()) {
EvaluateDependentExpr(RetExpr, Info);
// We know we returned, but we don't know what the value is.
return ESR_Failed;
}
if (RetExpr &&
!(Result.Slot
? EvaluateInPlace(Result.Value, Info, *Result.Slot, RetExpr)
Expand Down
92 changes: 70 additions & 22 deletions clang/lib/Headers/amxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
#define __AMXINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))

/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
Expand All @@ -31,9 +31,8 @@
///
/// \param __config
/// A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS
_tile_loadconfig(const void *__config)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
__builtin_ia32_tile_loadconfig(__config);
}

Expand All @@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
///
/// \param __config
/// A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS
_tile_storeconfig(void *__config)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
__builtin_ia32_tile_storeconfig(__config);
}

Expand All @@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_tile_release(void)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
__builtin_ia32_tilerelease();
}

Expand All @@ -80,8 +76,9 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))

/// Load tile rows from memory specifieid by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
Expand All @@ -99,8 +96,9 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))

/// Store the tile specified by "src" to memory specifieid by "base" address and
/// "stride" using the tile configuration previously configured via
Expand All @@ -116,7 +114,7 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
#define _tile_stored(dst, base, stride) \
__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tdest".
Expand Down Expand Up @@ -145,7 +143,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
#define _tile_dpbssd(dst, src0, src1) \
__builtin_ia32_tdpbssd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
Expand All @@ -163,7 +162,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
#define _tile_dpbsud(dst, src0, src1) \
__builtin_ia32_tdpbsud((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
Expand All @@ -181,7 +181,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
#define _tile_dpbusd(dst, src0, src1) \
__builtin_ia32_tdpbusd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
Expand All @@ -199,7 +200,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
#define _tile_dpbuud(dst, src0, src1) \
__builtin_ia32_tdpbuud((dst), (src0), (src1))

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
Expand All @@ -216,10 +218,56 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))

#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))

typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloadd64_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}

static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
return __builtin_ia32_tilestored64_internal(m, n, base,
(__SIZE_TYPE__)(stride), tile);
}

typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;

__DEFAULT_FN_ATTRS_INT8
static void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}

__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
__tile1024i src2) {
dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
src1.tile, src2.tile);
}

__DEFAULT_FN_ATTRS_INT8
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
51 changes: 51 additions & 0 deletions clang/test/CodeGen/X86/amx_api.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-int8 \
// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK

#include <immintrin.h>

char buf[1024];
#define STRIDE 32

char buf2[1024];

// This is an example code and integration test.
void test_api(int cond, short row, short col) {
//CHECK-LABEL: @test_api
//CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
//CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
//CHECK: call void @llvm.x86.tilestored64.internal
__tile1024i a = {row, 8};
__tile1024i b = {8, col};
__tile1024i c = {row, col};

if (cond) {
__tile_loadd(&a, buf, STRIDE);
__tile_loadd(&b, buf, STRIDE);
__tile_loadd(&c, buf, STRIDE);
} else {
__tile_loadd(&a, buf2, STRIDE);
__tile_loadd(&b, buf2, STRIDE);
__tile_loadd(&c, buf2, STRIDE);
}
__tile_dpbsud(&c, a, b);
__tile_stored(buf, STRIDE, c);
}

void test_tile_loadd(short row, short col) {
//CHECK-LABEL: @test_tile_loadd
//CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
__tile1024i a = {row, col};
__tile_loadd(&a, buf, STRIDE);
}

void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
//CHECK-LABEL: @test_tile_dpbsud
//CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
__tile_dpbsud(&c, a, b);
}

void test_tile_stored(__tile1024i c) {
//CHECK-LABEL: @test_tile_stored
//CHECK: call void @llvm.x86.tilestored64.internal
__tile_stored(buf, STRIDE, c);
}
3 changes: 3 additions & 0 deletions clang/test/SemaCXX/constexpr-function-recovery-crash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,6 @@ template<int x> constexpr int f(int y) { // expected-note {{candidate template i
constexpr int test9(int x) {
return f<1>(f<x>(1)); // expected-error {{no matching function for call to 'f'}}
}

constexpr int test10() { return undef(); } // expected-error {{use of undeclared identifier 'undef'}}
static_assert(test10() <= 1, "should not crash"); // expected-error {{static_assert expression is not an integral constant expression}}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class TestCase(TestBase):

def test(self):
self.build()
lldbutil.run_to_source_breakpoint(self,"// break here", lldb.SBFileSpec("main.cpp"))
lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp"))

# Member access
self.expect_expr("C.Base1::m_base", result_type="int", result_value="11")
Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/LiveIntervalUnion.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ class LiveIntervalUnion {
void verify(LiveVirtRegBitSet& VisitedVRegs);
#endif

// Get any virtual register that is assign to this physical unit
LiveInterval *getOneVReg() const;

/// Query interferences between a single live virtual register and a live
/// interval union.
class Query {
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/LiveRegMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ class LiveRegMatrix : public MachineFunctionPass {
/// Directly access the live interval unions per regunit.
/// This returns an array indexed by the regunit number.
LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; }

Register getOneVReg(unsigned PhysReg) const;
};

} // end namespace llvm
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,10 @@ namespace llvm {
/// The pass fixups statepoint machine instruction to replace usage of
/// caller saved registers with stack slots.
extern char &FixupStatepointCallerSavedID;

/// The pass transform load/store <256 x i32> to AMX load/store intrinsics
/// or split the data to two <128 x i32>.
FunctionPass *createX86LowerAMXTypePass();
} // End llvm namespace

#endif
97 changes: 97 additions & 0 deletions llvm/include/llvm/CodeGen/TileShapeInfo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Shape utility for AMX.
/// AMX hardware requires to config the shape of tile data register before use.
/// The 2D shape includes row and column. In AMX intrinsics interface the shape
/// is passed as 1st and 2nd parameter and they are lowered as the 1st and 2nd
/// machine operand of AMX pseudo instructions. ShapeT class is to facilitate
/// tile config and register allocator. The row and column are machine operand
/// of AMX pseudo instructions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_TILESHAPEINFO_H
#define LLVM_CODEGEN_TILESHAPEINFO_H

#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include <utility>

namespace llvm {

class ShapeT {
public:
ShapeT(MachineOperand *Row, MachineOperand *Col,
const MachineRegisterInfo *MRI = nullptr)
: Row(Row), Col(Col) {
if (MRI)
deduceImm(MRI);
}
ShapeT()
: Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
ColImm(InvalidImmShape) {}
bool operator==(const ShapeT &Shape) {
MachineOperand *R = Shape.Row;
MachineOperand *C = Shape.Col;
if (!R || !C)
return false;
if (!Row || !Col)
return false;
if (Row->getReg() == R->getReg() && Col->getReg() == C->getReg())
return true;
if ((RowImm != InvalidImmShape) && (ColImm != InvalidImmShape))
return RowImm == Shape.getRowImm() && ColImm == Shape.getColImm();
return false;
}

bool operator!=(const ShapeT &Shape) { return !(*this == Shape); }

MachineOperand *getRow() const { return Row; }

MachineOperand *getCol() const { return Col; }

int64_t getRowImm() const { return RowImm; }

int64_t getColImm() const { return ColImm; }

bool isValid() { return (Row != nullptr) && (Col != nullptr); }

void deduceImm(const MachineRegisterInfo *MRI) {
// All def must be the same value, otherwise it is invalid MIs.
// Find the immediate.
// TODO copy propagation.
auto GetImm = [&](Register Reg) {
int64_t Imm = InvalidImmShape;
for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
const auto *MI = DefMO.getParent();
if (MI->isMoveImmediate()) {
Imm = MI->getOperand(1).getImm();
break;
}
}
return Imm;
};
RowImm = GetImm(Row->getReg());
ColImm = GetImm(Col->getReg());
}

private:
static constexpr int64_t InvalidImmShape = -1;
MachineOperand *Row;
MachineOperand *Col;
int64_t RowImm;
int64_t ColImm;
};

} // namespace llvm

#endif
Loading

0 comments on commit 3083a61

Please sign in to comment.