Skip to content

Commit

Permalink
[AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/bf8 instructions (llvm#78414)
Browse files Browse the repository at this point in the history


    Add VOP1, VOP1_DPP8, VOP1_DPP16, VOP3, VOP3_DPP8, VOP3_DPP16
    instructions that were supported on GFX940 (MI300):
    - V_CVT_F32_FP8
    - V_CVT_F32_BF8
    - V_CVT_PK_F32_FP8
    - V_CVT_PK_F32_BF8
    - V_CVT_PK_FP8_F32
    - V_CVT_PK_BF8_F32
    - V_CVT_SR_FP8_F32
    - V_CVT_SR_BF8_F32

---------

Co-authored-by: Mateja Marjanovic <mateja.marjanovic@amd.com>
Co-authored-by: Mirko Brkušanin <Mirko.Brkusanin@amd.com>
Change-Id: I62e37982868d9f5b400bf794b82c59ae530080ed
  • Loading branch information
3 people authored and David Salinas committed Feb 9, 2024
1 parent f75c933 commit 9545c24
Show file tree
Hide file tree
Showing 37 changed files with 1,821 additions and 202 deletions.
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
35 changes: 18 additions & 17 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
Original file line number Diff line number Diff line change
@@ -1,59 +1,60 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s

typedef float v2f __attribute__((ext_vector_type(2)));

// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
// CHECK-LABEL: @test_cvt_f32_bf8
// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
// Calls __builtin_amdgcn_cvt_f32_bf8 on `a` with the constant 0 as the second
// argument and stores the result through `out`. The parameter name `a` and
// the literal 0 are what the expected-IR line above matches as `i32 %a, i32 0`.
void test_cvt_f32_bf8(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
}

// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
// CHECK-LABEL: @test_cvt_f32_fp8
// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
// Calls __builtin_amdgcn_cvt_f32_fp8 on `a` with the constant 1 as the second
// argument and stores the result through `out`. The parameter name `a` and
// the literal 1 are what the expected-IR line above matches as `i32 %a, i32 1`.
void test_cvt_f32_fp8(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
// CHECK-LABEL: @test_cvt_pk_f32_bf8
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
// Calls __builtin_amdgcn_cvt_pk_f32_bf8 on `a` with `false` as the second
// argument and stores the two-float vector result through `out`. The expected
// IR above matches this as `i32 %a, i1 false` returning `<2 x float>`.
void test_cvt_pk_f32_bf8(global v2f* out, int a)
{
*out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
// CHECK-LABEL: @test_cvt_pk_f32_fp8
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
// Calls __builtin_amdgcn_cvt_pk_f32_fp8 on `a` with `true` as the second
// argument and stores the two-float vector result through `out`. The expected
// IR above matches this as `i32 %a, i1 true` returning `<2 x float>`.
void test_cvt_pk_f32_fp8(global v2f* out, int a)
{
*out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
// CHECK-LABEL: @test_cvt_pk_bf8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
// Calls __builtin_amdgcn_cvt_pk_bf8_f32 with the two floats `a` and `b`, the
// previous value `old`, and `false` as the last argument, then stores the
// i32 result through `out`. Note the builtin takes (a, b, old, flag) while the
// test function's own parameter order is (out, old, a, b).
void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
{
*out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
// CHECK-LABEL: @test_cvt_pk_fp8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
// Calls __builtin_amdgcn_cvt_pk_fp8_f32 with the two floats `a` and `b`, the
// previous value `old`, and `true` as the last argument, then stores the
// i32 result through `out`. Note the builtin takes (a, b, old, flag) while the
// test function's own parameter order is (out, old, a, b).
void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
{
*out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
}

// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
// CHECK-LABEL: @test_cvt_sr_bf8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
// Calls __builtin_amdgcn_cvt_sr_bf8_f32 with the float `a`, the integer `b`,
// the previous value `old`, and the constant 2 as the last argument, then
// stores the i32 result through `out`. The expected IR above matches the
// constant as the trailing `i32 2` immediate.
void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
{
*out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
}

// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
// CHECK-LABEL: @test_cvt_sr_fp8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
{
*out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1507,6 +1507,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
FeatureFP8ConversionInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
Expand Down
31 changes: 29 additions & 2 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3530,6 +3530,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
return !isInlineConstant(Inst, OpIdx);
} else if (MO.isReg()) {
auto Reg = MO.getReg();
if (!Reg) {
return false;
}
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
auto PReg = mc2PseudoReg(Reg);
return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
Expand Down Expand Up @@ -8427,12 +8430,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;

if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
Inst.addOperand(Inst.getOperand(0));
}

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
// Adding vdst_in operand is already covered for these DPP instructions in
// cvtVOP3DPP.
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
}
Expand Down Expand Up @@ -8933,6 +8944,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
}
}

int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(Inst.getOperand(0));
}

bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
if (IsVOP3CvtSrDpp) {
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(MCOperand::createImm(0));
Inst.addOperand(MCOperand::createReg(0));
}
}

auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
MCOI::TIED_TO);
if (TiedTo != -1) {
Expand Down
26 changes: 26 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
AMDGPU::OpName::src2_modifiers);
}

if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}

if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
!AMDGPU::hasGDS(STI)) {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
Expand Down Expand Up @@ -950,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
// first add optional MI operands to check FI
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();

if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
convertVOP3PDPPInst(MI);
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
Expand All @@ -959,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);

int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);

if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);

unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
Expand All @@ -985,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);

int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);

if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);

unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1322,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned Opc = MI->getOpcode();
if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
auto SrcMod =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
unsigned Mod = MI->getOperand(SrcMod).getImm();
unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
if (Index0 || Index1)
O << " op_sel:[" << Index0 << ',' << Index1 << ']';
return;
}
if (isPermlane16(Opc)) {
auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1696,8 +1696,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod0:$clamp, omod0:$omod),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod0:$clamp))
!if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
/* else */,
// VOP1 without modifiers
!if (HasClamp,
Expand Down Expand Up @@ -2293,6 +2294,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsWMMA = 0;
field bit IsSWMMAC = 0;

field bit IsFP8 = 0;

field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,17 @@ bool isPermlane16(unsigned Opc) {
Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
}

// Returns true when Opc is one of the GFX12 `_e64` opcodes listed below:
// V_CVT_F32_{BF8,FP8} in its plain, `_dpp` and `_dpp8` variants, or the plain
// V_CVT_PK_F32_{BF8,FP8}. Every other opcode yields false.
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_CVT_F32_BF8_e64_gfx12:
  case AMDGPU::V_CVT_F32_FP8_e64_gfx12:
  case AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12:
  case AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12:
  case AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12:
  case AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12:
  case AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12:
  case AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12:
    return true;
  default:
    return false;
  }
}

bool isGenericAtomic(unsigned Opc) {
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,9 @@ bool isPermlane16(unsigned Opc);
LLVM_READNONE
bool isGenericAtomic(unsigned Opc);

LLVM_READNONE
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);

namespace VOPD {

enum Component : unsigned {
Expand Down
Loading

0 comments on commit 9545c24

Please sign in to comment.