[AMDGPU] Fix regclass for operand subregs in SIInsertWaterfall

If an operand is a register with a subregister index, getRegClass returns the larger register class of the full register. We need to check whether a subregister is used instead of the full register and, if so, use only that subregister in any later instructions. Change-Id: I892f51f7668c250d33c811d705ea9a0e28b48478
jaebaek · Jul 21, 2020 · 44408c8 · 44408c8
1 parent 3dbeb7a
commit 44408c8
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 6 deletions.
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp
@@ -162,7 +162,8 @@ static void readFirstLaneReg(MachineBasicBlock &MBB, MachineRegisterInfo *MRI,
 
   if (RegSize == 1)
     BuildMI(MBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RFLReg)
-        .addReg(RFLSrcReg, getUndefRegState(RFLSrcOp.isUndef()));
+        .addReg(RFLSrcReg, getUndefRegState(RFLSrcOp.isUndef()),
+                RFLSrcOp.getSubReg());
   else {
     SmallVector<unsigned, 8> TRegs;
     for (unsigned i = 0; i < RegSize; ++i) {
@@ -213,7 +214,8 @@ static unsigned compareIdx(MachineBasicBlock &MBB, MachineRegisterInfo *MRI,
   // Iterate over the index in dword chunks and'ing the result with the
   // CondReg
   unsigned IndexReg = IndexOp.getReg();
-  auto IndexRC = MRI->getRegClass(IndexReg);
+  auto IndexRC = RI->getSubRegClass(MRI->getRegClass(IndexOp.getReg()),
+                                    IndexOp.getSubReg());
   unsigned AndOpc =
       IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
   const auto *BoolXExecRC = RI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
@@ -485,7 +487,8 @@ bool SIInsertWaterfall::processWaterfall(MachineBasicBlock &MBB) {
     // Loop is ended after the last SI_WATERFALL_END and these instructions are
     // removed with the src replacing all dst uses
     auto Index = TII->getNamedOperand(*(Item.Begin), AMDGPU::OpName::idx);
-    auto IndexRC = MRI->getRegClass(Index->getReg());
+    auto IndexRC = RI->getSubRegClass(MRI->getRegClass(Index->getReg()),
+                                      Index->getSubReg());
 
     if (!RI->hasVGPRs(IndexRC)) {
       // Waterfall loop index is uniform! Loop can be removed

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll
@@ -10,7 +10,7 @@
 ; GCN-32: v_cmp_eq_u32_e64 [[EXEC:s[0-9]+]], [[VAL1]], [[VAL2]]
 ; GCN-64: s_and_saveexec_b64 [[EXEC]], [[EXEC]]
 ; GCN-32: s_and_saveexec_b32 [[EXEC]], [[EXEC]]
-; GCN: v_readlane_b32 [[RLVAL:s[0-9]+]], v1, [[VAL1]]
+; GCN: v_readlane_b32 [[RLVAL:s[0-9]+]], v{{[0-9]+}}, [[RLVAL]]
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[RLVAL]]
 ; GCN: v_or_b32_e32 [[ACCUM:v[0-9]+]], [[ACCUM]], [[VVAL]]
 ; GCN-64: s_xor_b64 exec, exec, [[EXEC]]
@@ -20,14 +20,14 @@
 ; GCN-32: s_mov_b32 exec_lo, s{{[0-9]+}}
 ; VI: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ACCUM]]
 ; GFX9_UP: global_store_dword v[{{[0-9]+:[0-9]+}}], [[ACCUM]], off
-define amdgpu_ps void @test_waterfall_readlane(i32 addrspace(1)* inreg %out, <2 x i32> addrspace(1)* inreg %in, i32 %tid, i32 %val) #1 {
+define amdgpu_ps void @test_waterfall_readlane(i32 addrspace(1)* inreg %out, <2 x i32> addrspace(1)* inreg %in, i32 %tid) #1 {
   %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
   %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
   %value = extractelement <2 x i32> %args, i32 0
   %lane = extractelement <2 x i32> %args, i32 1
   %wf_token = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 %lane)
   %readlane = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %wf_token, i32 %lane)
-  %readlane1 = call i32 @llvm.amdgcn.readlane(i32 %val, i32 %readlane)
+  %readlane1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %readlane)
   %readlane2 = call i32 @llvm.amdgcn.waterfall.end.i32(i32 %wf_token, i32 %readlane1)
   ; This store instruction should be outside the waterfall loop and the value
   ; being stored generated incrementally in the loop itself