Skip to content

Commit

Permalink
[AMDGPU] Simplify, fix and improve known bits for mbcnt (#104768)
Browse files Browse the repository at this point in the history
Simplify by using KnownBits::add.

Fix the GlobalISel path, which was ignoring the known bits of src1.

Improve the analysis of mbcnt.hi, which adds at most 31 to src1 even in wave64.
  • Loading branch information
jayfoad authored Aug 19, 2024
1 parent 340fb65 commit 2258bc4
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 20 deletions.
31 changes: 17 additions & 14 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15758,16 +15758,12 @@ void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case Intrinsic::amdgcn_mbcnt_hi: {
const GCNSubtarget &ST =
DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
// These return at most the (wavefront size - 1) + src1
// As long as src1 is an immediate we can calc known bits
KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
unsigned Src1ValBits = Src1Known.countMaxActiveBits();
unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
// Cater for potential carry
MaxActiveBits += Src1ValBits ? 1 : 0;
unsigned Size = Op.getValueType().getSizeInBits();
if (MaxActiveBits < Size)
Known.Zero.setHighBits(Size - MaxActiveBits);
// Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
// most 31 + src1.
Known.Zero.setBitsFrom(
IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
Known = KnownBits::add(Known, Known2);
return;
}
}
Expand Down Expand Up @@ -15802,7 +15798,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
switch (MI->getOpcode()) {
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
switch (IID) {
case Intrinsic::amdgcn_workitem_id_x:
knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
break;
Expand All @@ -15814,9 +15811,15 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
break;
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
// These return at most the wavefront size - 1.
unsigned Size = MRI.getType(R).getSizeInBits();
Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
// Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
// most 31 + src1.
Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
? getSubtarget()->getWavefrontSizeLog2()
: 5);
KnownBits Known2;
KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
Depth + 1);
Known = KnownBits::add(Known, Known2);
break;
}
case Intrinsic::amdgcn_groupstaticsize: {
Expand Down
23 changes: 17 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}mbcnt_intrinsics:
; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0
; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]]
; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
; GCN: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
define amdgpu_ps void @mbcnt_intrinsics(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3) {
main_body:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
Expand Down Expand Up @@ -80,13 +79,25 @@ define i32 @mbcnt_hi_known_bits_3(i32 %x) #0 {

; GCN-LABEL: {{^}}mbcnt_hi_known_bits_4:
; GCN: v_mbcnt_hi_u32_b32
; GCN: v_and_b32_e32
; GCN-NOT: v_and_b32_e32
define i32 @mbcnt_hi_known_bits_4(i32 %x) #0 {
; Known-bits test for mbcnt.hi with a constant src1 of 15 and an unknown mask
; operand %x. mbcnt.hi adds at most 31 to src1, so the result is at most
; 31 + 15 = 46, which fits in the low six bits.
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 %x, i32 15)
; Masking with 63 keeps exactly the low six bits, so this AND cannot change
; the value computed above.
%mask = and i32 %hi, 63
ret i32 %mask
}

; TODO: Special case mbcnt.lo feeding into mbcnt.hi to remove this AND.
; GCN-LABEL: {{^}}mbcnt_lo_hi_known_bits_1:
; GCN: v_mbcnt_lo_u32_b32
; GCN: v_mbcnt_hi_u32_b32
; GCN: v_and_b32_e32
define i32 @mbcnt_lo_hi_known_bits_1(i32 %x) #0 {
; Chained form: mbcnt.lo starts from a src1 of 0 and its result is passed to
; mbcnt.hi as src1, so the bound on the final value depends on the known bits
; derived for %lo rather than on a constant.
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 %x, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 %x, i32 %lo)
; The checks for this function still expect this AND with 63 to be emitted;
; the TODO preceding them tracks special-casing the lo -> hi chain.
%mask = and i32 %hi, 63
ret i32 %mask
}

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
Expand Down

0 comments on commit 2258bc4

Please sign in to comment.