Skip to content

Commit

Permalink
[MachineLICM] Correctly Apply Register Masks (#95746)
Browse files Browse the repository at this point in the history
Fix regression introduced in d4b8b72
  • Loading branch information
Pierre-vh authored Jun 17, 2024
1 parent c2d9f25 commit 770393b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 24 deletions.
35 changes: 13 additions & 22 deletions llvm/lib/CodeGen/MachineLICM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,38 +426,29 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
BitVector &RUs,
const uint32_t *Mask) {
// Iterate over the RegMask raw to avoid constructing a BitVector, which is
// expensive as it implies dynamically allocating memory.
//
// We also work backwards.
BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
const unsigned NumRegs = TRI.getNumRegs();
const unsigned MaskWords = (NumRegs + 31) / 32;
for (unsigned K = 0; K < MaskWords; ++K) {
// We want to set the bits that aren't in RegMask, so flip it.
uint32_t Word = ~Mask[K];

// Iterate all set bits, starting from the right.
while (Word) {
const unsigned SetBitIdx = countr_zero(Word);

// The bits are numbered from the LSB in each word.
const unsigned PhysReg = (K * 32) + SetBitIdx;

// Clear the bit at SetBitIdx. Doing it this way appears to generate fewer
// instructions on x86. This works because negating a number will flip all
// the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
// faster.
Word ^= Word & -Word;
const uint32_t Word = Mask[K];
if (!Word)
continue;

for (unsigned Bit = 0; Bit < 32; ++Bit) {
const unsigned PhysReg = (K * 32) + Bit;
if (PhysReg == NumRegs)
return;
break;

if (PhysReg) {
// Check if we have a valid PhysReg that is set in the mask.
// FIXME: We shouldn't have to check for PhysReg.
if (PhysReg && ((Word >> Bit) & 1)) {
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
RUs.set(*RUI);
ClobberedRUs.reset(*RUI);
}
}
}

RUs |= ClobberedRUs;
}

/// Examine the instruction for potential LICM candidate. Also
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: v_writelane_b32 v40, s62, 30
; GCN-NEXT: v_writelane_b32 v40, s63, 31
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
Expand Down Expand Up @@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
; GISEL-NEXT: s_mov_b64 s[6:7], exec
; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
Expand Down

0 comments on commit 770393b

Please sign in to comment.