From 44408c8ca2064fed8e04baeae5fcedaac849ec65 Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin Date: Fri, 17 Jul 2020 15:22:03 +0200 Subject: [PATCH] [AMDGPU] Fix regclass for operand subregs in SIInsertWaterfall If an operand is a register with a subregister marking, then getRegClass returns the larger class of the full register. We need to check whether a subreg is used instead of the full register and use only that subreg in any later instructions. Change-Id: I892f51f7668c250d33c811d705ea9a0e28b48478 --- llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp | 9 ++++++--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp index b4403c5af81525..58ec0c292fecbc 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp @@ -162,7 +162,8 @@ static void readFirstLaneReg(MachineBasicBlock &MBB, MachineRegisterInfo *MRI, if (RegSize == 1) BuildMI(MBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RFLReg) - .addReg(RFLSrcReg, getUndefRegState(RFLSrcOp.isUndef())); + .addReg(RFLSrcReg, getUndefRegState(RFLSrcOp.isUndef()), + RFLSrcOp.getSubReg()); else { SmallVector TRegs; for (unsigned i = 0; i < RegSize; ++i) { @@ -213,7 +214,8 @@ static unsigned compareIdx(MachineBasicBlock &MBB, MachineRegisterInfo *MRI, // Iterate over the index in dword chunks and'ing the result with the // CondReg unsigned IndexReg = IndexOp.getReg(); - auto IndexRC = MRI->getRegClass(IndexReg); + auto IndexRC = RI->getSubRegClass(MRI->getRegClass(IndexOp.getReg()), + IndexOp.getSubReg()); unsigned AndOpc = IsWave32 ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; const auto *BoolXExecRC = RI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); @@ -485,7 +487,8 @@ bool SIInsertWaterfall::processWaterfall(MachineBasicBlock &MBB) { // Loop is ended after the last SI_WATERFALL_END and these instructions are // removed with the src replacing all dst uses auto Index = TII->getNamedOperand(*(Item.Begin), AMDGPU::OpName::idx); - auto IndexRC = MRI->getRegClass(Index->getReg()); + auto IndexRC = RI->getSubRegClass(MRI->getRegClass(Index->getReg()), + Index->getSubReg()); if (!RI->hasVGPRs(IndexRC)) { // Waterfall loop index is uniform! Loop can be removed diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll index 0c470fec707144..010588f838184a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll @@ -10,7 +10,7 @@ ; GCN-32: v_cmp_eq_u32_e64 [[EXEC:s[0-9]+]], [[VAL1]], [[VAL2]] ; GCN-64: s_and_saveexec_b64 [[EXEC]], [[EXEC]] ; GCN-32: s_and_saveexec_b32 [[EXEC]], [[EXEC]] -; GCN: v_readlane_b32 [[RLVAL:s[0-9]+]], v1, [[VAL1]] +; GCN: v_readlane_b32 [[RLVAL:s[0-9]+]], v{{[0-9]+}}, [[RLVAL]] ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[RLVAL]] ; GCN: v_or_b32_e32 [[ACCUM:v[0-9]+]], [[ACCUM]], [[VVAL]] ; GCN-64: s_xor_b64 exec, exec, [[EXEC]] @@ -20,14 +20,14 @@ ; GCN-32: s_mov_b32 exec_lo, s{{[0-9]+}} ; VI: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ACCUM]] ; GFX9_UP: global_store_dword v[{{[0-9]+:[0-9]+}}], [[ACCUM]], off -define amdgpu_ps void @test_waterfall_readlane(i32 addrspace(1)* inreg %out, <2 x i32> addrspace(1)* inreg %in, i32 %tid, i32 %val) #1 { +define amdgpu_ps void @test_waterfall_readlane(i32 addrspace(1)* inreg %out, <2 x i32> addrspace(1)* inreg %in, i32 %tid) #1 { %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in %value = extractelement <2 x i32> %args, i32 0 %lane = extractelement <2 x 
i32> %args, i32 1 %wf_token = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 %lane) %readlane = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %wf_token, i32 %lane) - %readlane1 = call i32 @llvm.amdgcn.readlane(i32 %val, i32 %readlane) + %readlane1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %readlane) %readlane2 = call i32 @llvm.amdgcn.waterfall.end.i32(i32 %wf_token, i32 %readlane1) ; This store instruction should be outside the waterfall loop and the value ; being stored generated incrementally in the loop itself