[AMDGPU] Properly check op_sel in GCNDPPCombine #79122

mbrkusanin · 2024-01-23T11:37:16Z

No description provided.

llvmbot · 2024-01-23T11:37:50Z

@llvm/pr-subscribers-backend-amdgpu

Author: Mirko Brkušanin (mbrkusanin)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/79122.diff

2 Files Affected:

(modified) llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp (+20-8)
(modified) llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir (+75-4)

diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index a75082268c7739f..94d28dc0a2c74a1 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -274,8 +274,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       break;
     }
 
-    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
-                                          AMDGPU::OpName::src0_modifiers)) {
+    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
+    if (Mod0) {
       assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                           AMDGPU::OpName::src0_modifiers));
       assert(HasVOP3DPP ||
@@ -298,8 +298,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     DPPInst->getOperand(NumOperands).setIsKill(false);
     ++NumOperands;
 
-    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
-                                          AMDGPU::OpName::src1_modifiers)) {
+    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
+    if (Mod1) {
       assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                           AMDGPU::OpName::src1_modifiers));
       assert(HasVOP3DPP ||
@@ -330,8 +330,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       DPPInst.add(*Src1);
       ++NumOperands;
     }
-    if (auto *Mod2 =
-            TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
+
+    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
+    if (Mod2) {
       assert(NumOperands ==
              AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
       assert(HasVOP3DPP ||
@@ -350,6 +351,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       DPPInst.add(*Src2);
       ++NumOperands;
     }
+
     if (HasVOP3DPP) {
       auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
       if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
@@ -368,7 +370,13 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       // all 1.
       if (auto *OpSelOpr =
               TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
-        auto OpSel = OpSelOpr->getImm();
+        int64_t OpSel = 0;
+        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
+        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
+        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
+        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
+          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;
+
         if (OpSel != 0) {
           LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
           Fail = true;
@@ -379,7 +387,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       }
       if (auto *OpSelHiOpr =
               TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
-        auto OpSelHi = OpSelHiOpr->getImm();
+        int64_t OpSelHi = 0;
+        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
+        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
+        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);
+
         // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
         // the bitmask for 3 op_sel_hi bits set
         assert(Src2 && "Expected vop3p with 3 operands");
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 89ea3ed12aea3b2..3df087256ad6e76 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -83,7 +83,8 @@ body:             |
 # Regression test for src_modifiers on base u16 opcode
 # GCN-label: name: vop3_u16
 # GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 4, %5, 8, %5, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
 name: vop3_u16
 tracksRegLiveness: true
 body:             |
@@ -97,7 +98,9 @@ body:             |
     %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
     %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec
     %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec
-    %7:vgpr_32 = V_ADD_NC_U16_e64 4, %6, 8, %5, 0, 0, implicit $exec
+    %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit $exec
+    %8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec
+    %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
 ...
 
 name: vop3p
@@ -116,7 +119,7 @@ body:             |
     ; GCN: [[V_DOT2_F32_F16_:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp]], 0, [[COPY]], 0, [[COPY2]], 0, 5, 0, 0, 0, implicit $mode, implicit $exec
     ; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
     ; GCN: [[V_DOT2_F32_F16_1:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, 4, 0, 0, implicit $mode, implicit $exec
-    ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 13, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec
+    ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 9, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec
     ; GCN: [[V_FMA_MIX_F32_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
     ; GCN: [[V_FMA_MIXLO_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 0, [[COPY2]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
     ; GCN: [[V_FMA_MIXHI_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, [[COPY]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
@@ -134,7 +137,7 @@ body:             |
     %7:vgpr_32 = V_DOT2_F32_F16 0, %6, 0, %0, 0, %2, 0, 0, 4, 0, 0, implicit $mode, implicit $exec
 
     %8:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
-    %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 13, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec
+    %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 9, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec
 
     %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
     %11:vgpr_32 = V_FMA_MIX_F32 8, %10, 8, %0, 8, %2, 1, 0, 7, implicit $mode, implicit $exec
@@ -871,3 +874,71 @@ body: |
     %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec
     %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
 ...
+
+# Check op_sel is all 0s when combining
+# GCN-LABEL: name: opsel_vop3
+# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+name:            opsel_vop3
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = IMPLICIT_DEF
+
+    ; Combine for op_sel:[0,0,0]
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec
+
+    ; Do not combine for op_sel:[1,0,0]
+    %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+
+    ; Do not combine for op_sel:[0,1,0]
+    %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+
+    ; Do not combine for op_sel:[1,1,0]
+    %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+...
+
+# Check op_sel is all 0s and op_sel_hi is all 1s when combining
+# GCN-LABEL: name: opsel_vop3p
+# GCN: %5:vgpr_32 = V_FMA_MIX_F32 0, %4, 0, %1, 0, %2, 0, 0, 0, implicit $mode, implicit $exec
+# GCN: %7:vgpr_32 = V_FMA_MIX_F32 4, %6, 4, %1, 4, %2, 0, 0, 0, implicit $mode, implicit $exec
+# GCN: %9:vgpr_32 = V_FMA_MIX_F32_dpp %3, 8, %0, 8, %1, 8, %2, 0, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %11:vgpr_32 = V_FMA_MIX_F32 12, %10, 12, %1, 12, %2, 0, 0, 0, implicit $mode, implicit $exec
+
+name: opsel_vop3p
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    %3:vgpr_32 = IMPLICIT_DEF
+
+    ; Do not combine for op_sel:[0,0,0] op_sel_hi:[0,0,0]
+    %4:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec
+    %5:vgpr_32 = V_FMA_MIX_F32 0, %4, 0, %1, 0, %2, 0, 0, 0, implicit $mode, implicit $exec
+
+    ; Do not combine for op_sel:[1,1,1] op_sel_hi:[0,0,0]
+    %6:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec
+    %7:vgpr_32 = V_FMA_MIX_F32 4, %6, 4, %1, 4, %2, 0, 0, 0, implicit $mode, implicit $exec
+
+    ; Combine for op_sel:[0,0,0] op_sel_hi:[1,1,1]
+    %8:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec
+    %9:vgpr_32 = V_FMA_MIX_F32 8, %8, 8, %1, 8, %2, 0, 0, 0, implicit $mode, implicit $exec
+
+    ; Do not combine for op_sel:[1,1,1] op_sel_hi:[1,1,1]
+    %10:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec
+    %11:vgpr_32 = V_FMA_MIX_F32 12, %10, 12, %1, 12, %2, 0, 0, 0, implicit $mode, implicit $exec
+...

Sisyph

Overall looks great, thanks for fixing it so fast. I think it needs a test to reject formation when DST_OP_SEL is 1, but otherwise LGTM.

mariusz-sikora-at-amd · 2024-01-23T15:56:47Z

Overall looks great, thanks for fixing it so fast. I think it needs a test to reject formation when DST_OP_SEL is 1, but otherwise LGTM.

I verified the change, we will have tests in these PR #78414
https://github.com/llvm/llvm-project/pull/78414/files#diff-4f33ce09676ec00dd307fa51d99e0277c4e65d89d8996fc9ed419eabbd17fe1fR7

mbrkusanin · 2024-01-23T16:00:18Z

Rebased and added test that has only DST_OP_SEL set to 1.

Sisyph

LGTM

llvmbot added the backend:AMDGPU label Jan 23, 2024

mbrkusanin mentioned this pull request Jan 23, 2024

[AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… #78414

Merged

Sisyph reviewed Jan 23, 2024

View reviewed changes

mbrkusanin added 2 commits January 23, 2024 16:58

[AMDGPU] Properly check op_sel in GCNDPPCombine

7edc140

add test for vop3 with dst_op_sel

48935fd

mbrkusanin force-pushed the dppcombine-opsel branch from 4f120cb to 48935fd Compare January 23, 2024 15:58

Sisyph approved these changes Jan 23, 2024

View reviewed changes

mbrkusanin merged commit 6bb7d51 into llvm:main Jan 23, 2024
3 of 4 checks passed

mbrkusanin deleted the dppcombine-opsel branch January 24, 2024 15:40

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Properly check op_sel in GCNDPPCombine #79122

[AMDGPU] Properly check op_sel in GCNDPPCombine #79122

mbrkusanin commented Jan 23, 2024

llvmbot commented Jan 23, 2024

Sisyph left a comment

mariusz-sikora-at-amd commented Jan 23, 2024 •

edited

Loading

mbrkusanin commented Jan 23, 2024

Sisyph left a comment

[AMDGPU] Properly check op_sel in GCNDPPCombine #79122

[AMDGPU] Properly check op_sel in GCNDPPCombine #79122

Conversation

mbrkusanin commented Jan 23, 2024

llvmbot commented Jan 23, 2024

Sisyph left a comment

Choose a reason for hiding this comment

mariusz-sikora-at-amd commented Jan 23, 2024 • edited Loading

mbrkusanin commented Jan 23, 2024

Sisyph left a comment

Choose a reason for hiding this comment

mariusz-sikora-at-amd commented Jan 23, 2024 •

edited

Loading