diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 7495bca72a9df5..e1021241728c69 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 56d757012a5e78..4e3a56b4201bb6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,59 +1,60 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); -// CHECK-GFX940-LABEL: @test_cvt_f32_bf8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) +// CHECK-LABEL: @test_cvt_f32_bf8 +// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) void test_cvt_f32_bf8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_bf8(a, 0); } -// CHECK-GFX940-LABEL: @test_cvt_f32_fp8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) +// CHECK-LABEL: @test_cvt_f32_fp8 +// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) void test_cvt_f32_fp8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8(a, 1); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) +// CHECK-LABEL: @test_cvt_pk_f32_bf8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) void test_cvt_pk_f32_bf8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) +// CHECK-LABEL: @test_cvt_pk_f32_fp8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) void test_cvt_pk_f32_fp8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true); } -// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) +// CHECK-LABEL: @test_cvt_pk_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) +// CHECK-LABEL: @test_cvt_pk_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true); } -// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) +// CHECK-LABEL: @test_cvt_sr_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2); } -// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) +// CHECK-LABEL: @test_cvt_sr_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c3acd5872684a9..a266cfa79368ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1507,6 +1507,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, + FeatureFP8ConversionInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 7758566048711c..bf7f5272783269 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -3530,6 +3530,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { return !isInlineConstant(Inst, OpIdx); } else if (MO.isReg()) { auto Reg = MO.getReg(); + if (!Reg) { + return false; + } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); auto PReg = mc2PseudoReg(Reg); return isSGPR(PReg, TRI) && PReg != SGPR_NULL; @@ -8427,12 +8430,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || - Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) { + Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods Inst.addOperand(Inst.getOperand(0)); } - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) { + // Adding vdst_in operand is already covered for these DPP instructions in + // cvtVOP3DPP. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); } @@ -8933,6 +8944,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, } } + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + if (VdstInIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(Inst.getOperand(0)); + } + + bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; + if (IsVOP3CvtSrDpp) { + if (Src2ModIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(0)); + } + } + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); if (TiedTo != -1) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ea105d2c738c62..b991dd2e1a555b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -720,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, AMDGPU::OpName::src2_modifiers); } + if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + // Insert dummy unused src2_modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); @@ -950,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { // first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || @@ -959,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { @@ -985,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 007785dc491314..abfa4a3531e8e1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1322,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Opc = MI->getOpcode(); + if (isCvt_F32_Fp8_Bf8_e64(Opc)) { + auto SrcMod = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + unsigned Mod = MI->getOperand(SrcMod).getImm(); + unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0); + unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1); + if (Index0 || Index1) + O << " op_sel:[" << Index0 << ',' << Index1 << ']'; + return; + } if (isPermlane16(Opc)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 64997a0e006b83..122bdbfc9aadbb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1696,8 +1696,9 @@ class getIns64 _ArgVT, bit _EnableClamp = 0> { field bit IsWMMA = 0; field bit IsSWMMAC = 0; + field bit IsFP8 = 0; + field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index ccf6b553a7158c..59f551f4b1f736 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -541,6 +541,17 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { + return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12; +} + bool isGenericAtomic(unsigned Opc) { return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 351563e957f14a..b5c382c512a18c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -542,6 +542,9 @@ bool isPermlane16(unsigned Opc); LLVM_READNONE bool isGenericAtomic(unsigned Opc); +LLVM_READNONE +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc); + namespace VOPD { enum Component : unsigned { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index d9cb656c5fc0d5..41182f5f1951bb 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -571,6 +571,7 @@ let SubtargetPredicate = isGFX9Only in { } // End SubtargetPredicate = isGFX9Only class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F { + let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; let HasExt = 1; @@ -599,6 +600,7 @@ class Cvt_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; @@ -617,6 +619,7 @@ foreach Index = [1, 2, 3] in { def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } +} // End SubtargetPredicate = isGFX9Only class Cvt_PK_F32_F8_Pat : GCNPat< @@ -626,11 +629,77 @@ class Cvt_PK_F32_F8_Pat; -foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat; - def : Cvt_PK_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat; + def : Cvt_PK_F32_F8_Pat; + } +} + + +// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. +def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { + let HasOpSel = 1; + let HasExtVOP3DPP = 0; +} + +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> { + let HasOpSel = 1; + let HasExtDPP = 1; + let HasExtVOP3DPP = 1; + let IsFP8 = 1; + let HasClamp = 0; + let HasOMod = 0; + let HasModifiers = 1; + let Src1VOP3DPP = Src1RC64; +} + +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; + defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; +} + +class Cvt_F32_F8_Pat_OpSel index, + VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< + (f32 (node i32:$src, index)), + !if (index, + (inst_e64 !if(index{0}, + !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), + SRCMODS.OP_SEL_0), + !if(index{1}, SRCMODS.OP_SEL_1, 0)), + $src, 0), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, 1, 2, 3] in { + def : Cvt_F32_F8_Pat_OpSel; + def : Cvt_F32_F8_Pat_OpSel; + } +} + +class Cvt_PK_F32_F8_Pat_OpSel : GCNPat< + (v2f32 (node i32:$src, index)), + !if (index, + (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat_OpSel; + def : Cvt_PK_F32_F8_Pat_OpSel; + } } let SubtargetPredicate = isGFX10Plus in { @@ -854,6 +923,20 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name op, VOP3_Real_with_name; +// Define VOP1 instructions using the pseudo instruction with its old profile and +// VOP3 using the OpSel profile for the pseudo instruction. +defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">; +defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name; + +defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name; + defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 713b4712d563c0..14db5221021489 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -520,8 +520,26 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile { let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, FP32InputMods:$src1_modifiers, Src1RC64:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel); + let InsVOP3DPP = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); + let HasClamp = 0; - let HasExtVOP3DPP = 0; + let HasExtVOP3DPP = 1; } def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, @@ -530,14 +548,36 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, FP32InputMods:$src1_modifiers, Src1RC64:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel); + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; + let HasExtVOP3DPP = 1; + let HasOpSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let HasExtVOP3DPP = 0; + let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP16.ret>.ret); + let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP8.ret>.ret); } def IsPow2Plus1: PatLeaf<(i32 imm), [{ @@ -618,13 +658,13 @@ let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, class Cvt_PK_F8_F32_Pat : GCNPat< (i32 (node f32:$src0, f32:$src1, i32:$old, index)), - (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0)) + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0) >; class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> : GCNPat< (i32 (node f32:$src0, i32:$src1, i32:$old, index)), (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, - !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0)) + !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0) >; foreach Index = [0, -1] in { @@ -998,6 +1038,11 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>; defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>; +defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>; +defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>; + //===----------------------------------------------------------------------===// // GFX11, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index fae07852ffc425..20d7c88fb7e59f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -306,6 +306,11 @@ class VOP3OpSel_gfx10 op, VOPProfile p> : VOP3e_gfx10 { class VOP3OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx10; +class VOP3FP8OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10 { + let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0); +} + class VOP3DotOpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx11_gfx12{ let Inst{11} = ?; let Inst{12} = ?; @@ -741,7 +746,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); - let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?); let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); let Inst{15} = !if(P.HasClamp, clamp, 0); @@ -1409,14 +1414,20 @@ multiclass VOP3_Real_with_name op, string opName, defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; - if !not(ps.Pfl.HasOpSel) then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsFP8 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3FP8OpSel_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } } def Gen.Suffix#"_VOP3_alias" : MnemonicAlias, Requires<[Gen.AssemblerPredicate]>, LetDummies; } diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 2cfe23676d20f8..ecea6c30585228 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -295,7 +295,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; - Features["gws"] = true; + Features["fp8-conversion-insts"] = true; break; case GK_GFX1151: case GK_GFX1150: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll new file mode 100644 index 00000000000000..3620806971282e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 op_sel:[0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 op_sel:[1,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) + ret float %ret +} + +define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 1) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) +declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) +declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) +declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) + +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir new file mode 100644 index 00000000000000..40d2923b68a8c0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: test_cvt_f32_bf8_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte0 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_bf8_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_fp8_byte3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_fp8_byte3 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 12, [[COPY]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_pk_bf8_f32_word0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_bf8_f32_word0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_BF8_F32_e64 0, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_pk_fp8_f32_word1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_fp8_f32_word1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64_dpp [[DEF]], 8, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_sr_bf8_f32_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_bf8_f32_byte0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_BF8_F32_e64 0, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_sr_fp8_f32_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_fp8_f32_byte2 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64_dpp [[DEF]], 8, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 26d0d702d99db4..17b1fcf865e94e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) @@ -9,182 +11,524 @@ declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_bf8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_fp8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_fp8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_fp8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0: -; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1: -; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0: -; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1: -; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 42c1e0558c2d3d..afa914c8375f64 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -38,8 +38,10 @@ define amdgpu_kernel void @workgroup_ids_kernel() { ; ; GFX12-SDAG-LABEL: workgroup_ids_kernel: ; GFX12-SDAG: ; %bb.0: ; %.entry -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -47,6 +49,9 @@ define amdgpu_kernel void @workgroup_ids_kernel() { ; ; GFX12-GISEL-LABEL: workgroup_ids_kernel: ; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null @@ -177,28 +182,26 @@ define amdgpu_kernel void @caller() { ; ; GFX12-SDAG-LABEL: caller: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 ; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_mov_b32 s15, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s14, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-SDAG-NEXT: s_mov_b32 s12, s6 ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: caller: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 ; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-GISEL-NEXT: s_mov_b32 s14, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s15, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-GISEL-NEXT: s_mov_b32 s12, s6 ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call void @callee(i32 %idx) #0 @@ -261,8 +264,11 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 -; GFX12-NEXT: v_mov_b32_e32 v8, s14 +; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: v_mov_b32_e32 v8, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll index 8f279260b4bd8b..cfff0a969da9e7 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -33,12 +33,28 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX9ARCH-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: _amdgpu_cs_main: -; GFX12: ; %bb.0: ; %.entry -; GFX12-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm .entry: %idx = call i32 @llvm.amdgcn.workgroup.id.x() %idy = call i32 @llvm.amdgcn.workgroup.id.y() @@ -112,6 +128,7 @@ define amdgpu_cs void @caller() { ; ; GFX12-SDAG-LABEL: caller: ; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi ; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 @@ -120,6 +137,7 @@ define amdgpu_cs void @caller() { ; ; GFX12-GISEL-LABEL: caller: ; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo ; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 71a225184af69b..df618e331dcf3d 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -5,43 +5,24 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GFX9-SDAG-LABEL: workgroup_id_x: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_x: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: workgroup_id_x: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: workgroup_id_x: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_x: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -52,22 +33,23 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -80,37 +62,21 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GFX9-SDAG-LABEL: workgroup_id_xyz: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_xyz: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: @@ -118,15 +84,15 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -144,5 +110,7 @@ declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} ; GFX9-GISEL: {{.*}} ; GFX9-SDAG: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 35411ee0ba2a5e..c9c4fceffaeb0c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -397,6 +397,51 @@ v_ctz_i32_b32 v5, src_scc v_ctz_i32_b32 v255, 0xaf123456 // GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e32 v1, s3 +// GFX12: encoding: [0x03,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, 3 +// GFX12: encoding: [0x83,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, v3 +// GFX12: encoding: [0x03,0xdb,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, s3 +// GFX12: encoding: [0x03,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, 3 +// GFX12: encoding: [0x83,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, v3 +// GFX12: encoding: [0x03,0xd9,0x02,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], s5 +// GFX12: encoding: [0x05,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], 3 +// GFX12: encoding: [0x83,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], v3 +// GFX12: encoding: [0x03,0xdf,0x06,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdd,0x04,0x7e] + v_cvt_f16_f32 v5, v1 // GFX12: encoding: [0x01,0x15,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index dd6afb28c396a7..5e0e1b688bc582 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -337,6 +337,18 @@ v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] +v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] + +v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] + v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 6530de0268456d..36c89710ce8f89 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -73,6 +73,18 @@ v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] + v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index cf3f9c45bdcc8e..beb57999b855ea 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -1099,6 +1099,42 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 // GFX12: encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + v_cvt_pk_i16_f32 v5, v1, v2 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 26f63102df9508..df3430f376f69e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1015,6 +1015,114 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_m v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index de294b1ff2a22a..09dd6df618c5b6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -570,6 +570,54 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX12: encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e35bb632906722..7ee60262a5c1b4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -396,6 +396,144 @@ v_ctz_i32_b32_e64 v5, src_scc v_ctz_i32_b32_e64 v255, 0xaf123456 // GFX12: encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + v_cvt_f16_f32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 6b915bd14683a2..808f941197c427 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -336,6 +336,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +V_CVT_F32_FP8_e64_dpp v5, v1 quad_perm:[3,1,2,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0x27,0x00,0x2d] + +V_CVT_F32_FP8_e64_dpp v1, v3 quad_perm:[2,1,0,3] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0xc6,0x00,0x5e] + +V_CVT_F32_BF8_e64_dpp v5, v1 quad_perm:[0,3,2,1] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0x6c,0x00,0x2d] + +V_CVT_F32_BF8_e64_dpp v1, v3 quad_perm:[0,1,3,2] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0xb4,0x00,0x5e] + v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 61266f3776c284..f7b51cfb6bda8e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -84,6 +84,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt index a839f03c42ba19..39bb7338c80748 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt @@ -397,6 +397,42 @@ # GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e] +0x03,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e] +0x83,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e] +0x03,0xdb,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e] +0x03,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e] +0x83,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xd9,0x02,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] +0x03,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] +0x83,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] +0x03,0xdf,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] +0x03,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] +0x83,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] +0x03,0xdd,0x04,0x7e + # GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e] 0x01,0x15,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index bcb9ad9febb96d..5848333f41ef7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -337,6 +337,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e + # GFX12: v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 928165997dd919..d42e9ae25039bc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -49,6 +49,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index db690aa99e4ab7..f86903b8de44b7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -993,6 +993,42 @@ # GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] 0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20 + +# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 + # GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 69f61c7eb8030f..1be1d6e91ad8a7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -825,6 +825,114 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index a7f0183016147f..44b3f7594029fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -495,6 +495,54 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 4fe4284e8eb4e6..9a8368a65f3d37 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -396,6 +396,42 @@ # GFX12: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 + # GFX12: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index e914d139e240e1..8af274e0b4028f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -336,6 +336,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 2a4b677620d387..3d48d58c775b18 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -72,6 +72,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05