-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Implement IR variant of isFMAFasterThanFMulAndFAdd #121465
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Chinmay Deshpande (chinmaydd) Changes: Fixes #108751. Thanks @Shoreshen for helping out with the test case. Full diff: https://github.com/llvm/llvm-project/pull/121465.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 58b061f5c1af0d..e610f2627d2cd8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5728,6 +5728,33 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
return false;
}
+/// IR-level variant of isFMAFasterThanFMulAndFAdd. Refer to the comments added
+/// to the MIR variant for the rationale behind each case.
+///
+/// \param F  Function handed to SIModeRegisterDefaults to obtain the denormal
+///           mode; only consulted for the 16-bit and 32-bit cases.
+/// \param Ty Type being queried; only its scalar size in bits is inspected.
+/// \returns true if this hook reports FMA as faster than a separate FMUL +
+/// FADD for the scalar width of \p Ty; false for any width other than 16, 32
+/// or 64 bits.
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
+                                                  Type *Ty) const {
+  switch (Ty->getScalarSizeInBits()) {
+  case 16: {
+    // Check the subtarget feature first so the mode defaults are only built
+    // from the function when the answer can actually depend on them.
+    if (!Subtarget->has16BitInsts())
+      return false;
+
+    SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget);
+    return Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
+  }
+  case 32: {
+    if (!Subtarget->hasMadMacF32Insts())
+      return Subtarget->hasFastFMAF32();
+
+    // The denormal mode only matters once MAD/MAC f32 is known to exist.
+    SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget);
+    if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
+      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+  }
+  case 64:
+    // No mode or subtarget query is needed for 64-bit types.
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
if (!Ty.isScalar())
return false;
@@ -16942,6 +16969,37 @@ bool SITargetLowering::checkForPhysRegDependency(
return false;
}
+/// Check if it is profitable to hoist instruction in then/else to if.
+/// Not profitable if I and its user can form a FMA instruction
+/// because we prefer FMSUB/FMADD.
+///
+/// \param I Candidate instruction; only a single-use FMul is ever rejected.
+/// \returns false when \p I is an FMul whose only user is an FAdd/FSub, FMA
+/// is legal or custom for the user's operand type, fusion is allowed
+/// (AllowFPOpFusion is Fast or UnsafeFPMath is set) and FMA is reported as
+/// faster than FMUL + FADD for that type; true in every other case.
+bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
+  // Anything that does not have exactly one use is always allowed to hoist.
+  if (!I->hasOneUse())
+    return true;
+
+  Instruction *User = I->user_back();
+  // TODO: Add more patterns that are not profitable to hoist
+  switch (I->getOpcode()) {
+  case Instruction::FMul: {
+    if (User->getOpcode() != Instruction::FSub &&
+        User->getOpcode() != Instruction::FAdd)
+      return true;
+
+    const TargetOptions &Options = getTargetMachine().Options;
+    const Function *F = I->getFunction();
+    const DataLayout &DL = F->getDataLayout();
+    Type *Ty = User->getOperand(0)->getType();
+
+    // Hoisting stays profitable unless all three hold: FMA is available for
+    // the type, fusion is permitted, and FMA beats separate FMUL + FADD.
+    return !isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) ||
+           (Options.AllowFPOpFusion != FPOpFusion::Fast &&
+            !Options.UnsafeFPMath) ||
+           !isFMAFasterThanFMulAndFAdd(*F, Ty);
+  }
+  default:
+    break;
+  }
+
+  return true;
+}
+
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Instruction *AI) const {
// Given: atomicrmw fadd ptr %addr, float %val ordering
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 631f26542bbe6d..731fb5d79a90d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -457,6 +457,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
EVT VT) const override;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
const LLT Ty) const override;
+ bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override;
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override;
bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const override;
@@ -536,6 +537,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const TargetInstrInfo *TII, unsigned &PhysReg,
int &Cost) const override;
+ bool isProfitableToHoist(Instruction *I) const override;
+
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN = false,
diff --git a/llvm/test/CodeGen/AMDGPU/is-profitable-to-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/is-profitable-to-hoist-ir.ll
new file mode 100644
index 00000000000000..3c204fda38d458
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/is-profitable-to-hoist-ir.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=simplifycfg -verify-machineinstrs -hoist-common-insts=true -mtriple=amdgcn-- -mcpu=gfx1030 -fp-contract=fast < %s | FileCheck -check-prefix=GFX -check-prefix=GFX-FP-CONTRACT %s
+; RUN: opt -S -passes=simplifycfg -verify-machineinstrs -hoist-common-insts=true -mtriple=amdgcn-- -mcpu=gfx1030 -enable-unsafe-fp-math --denormal-fp-math=ieee < %s | FileCheck -check-prefix=GFX -check-prefix=GFX-UNSAFE-FP-IEEE %s
+; RUN: opt -S -passes=simplifycfg -verify-machineinstrs -hoist-common-insts=true -mtriple=amdgcn-- -mcpu=gfx1030 -enable-unsafe-fp-math --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX -check-prefix=GFX-UNSAFE-FP-PRESERVE %s
+
+; f64: the fmul shared by both branches must stay in its branch next to the
+; fadd/fsub that uses it instead of being hoisted into the entry block.
+define double @_branch(ptr dereferenceable(8) %x, ptr dereferenceable(8) %y, ptr dereferenceable(8) %a) #0 {
+; GFX-LABEL: define double @_branch(
+; GFX-SAME: ptr dereferenceable(8) [[X:%.*]], ptr dereferenceable(8) [[Y:%.*]], ptr dereferenceable(8) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[TMP0:%.*]] = load double, ptr [[Y]], align 8
+; GFX-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00
+; GFX-NEXT: [[TMP1:%.*]] = load double, ptr [[X]], align 8
+; GFX-NEXT: [[TMP2:%.*]] = load double, ptr [[A]], align 8
+; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; GFX: [[COMMON_RET:.*]]:
+; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi double [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ]
+; GFX-NEXT: ret double [[COMMON_RET_OP]]
+; GFX: [[IF_THEN]]:
+; GFX-NEXT: [[MUL]] = fmul fast double [[TMP1]], [[TMP2]]
+; GFX-NEXT: [[ADD:%.*]] = fadd fast double 1.000000e+00, [[MUL]]
+; GFX-NEXT: br label %[[COMMON_RET]]
+; GFX: [[IF_ELSE]]:
+; GFX-NEXT: [[MUL1:%.*]] = fmul fast double [[TMP1]], [[TMP2]]
+; GFX-NEXT: [[SUB]] = fsub fast double [[MUL1]], [[TMP0]]
+; GFX-NEXT: br label %[[COMMON_RET]]
+;
+entry:
+ %0 = load double, ptr %y, align 8
+ %cmp = fcmp oeq double %0, 0.000000e+00
+ %1 = load double, ptr %x, align 8
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %2 = load double, ptr %a, align 8
+ %mul = fmul fast double %1, %2
+ %add = fadd fast double 1.000000e+00, %mul
+ ret double %mul
+
+if.else: ; preds = %entry
+ %3 = load double, ptr %a, align 8
+ %mul1 = fmul fast double %1, %3
+ %sub = fsub fast double %mul1, %0
+ ret double %sub
+}
+
+; f32: as in the f64 case, the fmul is expected to stay in each branch rather
+; than being hoisted into the entry block.
+define float @_branch2(ptr dereferenceable(8) %x, ptr dereferenceable(8) %y, ptr dereferenceable(8) %a) #0 {
+; GFX-LABEL: define float @_branch2(
+; GFX-SAME: ptr dereferenceable(8) [[X:%.*]], ptr dereferenceable(8) [[Y:%.*]], ptr dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[TMP0:%.*]] = load float, ptr [[Y]], align 8
+; GFX-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00
+; GFX-NEXT: [[TMP1:%.*]] = load float, ptr [[X]], align 8
+; GFX-NEXT: [[TMP2:%.*]] = load float, ptr [[A]], align 8
+; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; GFX: [[COMMON_RET:.*]]:
+; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi float [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ]
+; GFX-NEXT: ret float [[COMMON_RET_OP]]
+; GFX: [[IF_THEN]]:
+; GFX-NEXT: [[MUL]] = fmul fast float [[TMP1]], [[TMP2]]
+; GFX-NEXT: [[ADD:%.*]] = fadd fast float 1.000000e+00, [[MUL]]
+; GFX-NEXT: br label %[[COMMON_RET]]
+; GFX: [[IF_ELSE]]:
+; GFX-NEXT: [[MUL1:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
+; GFX-NEXT: [[SUB]] = fsub fast float [[MUL1]], [[TMP0]]
+; GFX-NEXT: br label %[[COMMON_RET]]
+;
+entry:
+ %0 = load float, ptr %y, align 8
+ %cmp = fcmp oeq float %0, 0.000000e+00
+ %1 = load float, ptr %x, align 8
+ br i1 %cmp, label %if.then, label %if.else
+
+
+if.then: ; preds = %entry
+ %2 = load float, ptr %a, align 8
+ %mul = fmul fast float %1, %2
+ %add = fadd fast float 1.000000e+00, %mul
+ ret float %mul
+
+if.else: ; preds = %entry
+ %3 = load float, ptr %a, align 8
+ %mul1 = fmul fast float %1, %3
+ %sub = fsub fast float %mul1, %0
+ ret float %sub
+}
+
+; f16: the fmul stays in each branch with -fp-contract=fast and with
+; -enable-unsafe-fp-math --denormal-fp-math=ieee, but is hoisted into the
+; entry block with --denormal-fp-math=preserve-sign.
+define half @_branch3(ptr dereferenceable(8) %x, ptr dereferenceable(8) %y, ptr dereferenceable(8) %a) #0 {
+; GFX-FP-CONTRACT-LABEL: define half @_branch3(
+; GFX-FP-CONTRACT-SAME: ptr dereferenceable(8) [[X:%.*]], ptr dereferenceable(8) [[Y:%.*]], ptr dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
+; GFX-FP-CONTRACT-NEXT: [[ENTRY:.*:]]
+; GFX-FP-CONTRACT-NEXT: [[TMP0:%.*]] = load half, ptr [[Y]], align 8
+; GFX-FP-CONTRACT-NEXT: [[CMP:%.*]] = fcmp oeq half [[TMP0]], 0xH0000
+; GFX-FP-CONTRACT-NEXT: [[TMP1:%.*]] = load half, ptr [[X]], align 8
+; GFX-FP-CONTRACT-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 8
+; GFX-FP-CONTRACT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; GFX-FP-CONTRACT: [[COMMON_RET:.*]]:
+; GFX-FP-CONTRACT-NEXT: [[COMMON_RET_OP:%.*]] = phi half [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ]
+; GFX-FP-CONTRACT-NEXT: ret half [[COMMON_RET_OP]]
+; GFX-FP-CONTRACT: [[IF_THEN]]:
+; GFX-FP-CONTRACT-NEXT: [[MUL]] = fmul fast half [[TMP1]], [[TMP2]]
+; GFX-FP-CONTRACT-NEXT: [[ADD:%.*]] = fadd fast half 0xH3C00, [[MUL]]
+; GFX-FP-CONTRACT-NEXT: br label %[[COMMON_RET]]
+; GFX-FP-CONTRACT: [[IF_ELSE]]:
+; GFX-FP-CONTRACT-NEXT: [[MUL1:%.*]] = fmul fast half [[TMP1]], [[TMP2]]
+; GFX-FP-CONTRACT-NEXT: [[SUB]] = fsub fast half [[MUL1]], [[TMP0]]
+; GFX-FP-CONTRACT-NEXT: br label %[[COMMON_RET]]
+;
+; GFX-UNSAFE-FP-IEEE-LABEL: define half @_branch3(
+; GFX-UNSAFE-FP-IEEE-SAME: ptr dereferenceable(8) [[X:%.*]], ptr dereferenceable(8) [[Y:%.*]], ptr dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
+; GFX-UNSAFE-FP-IEEE-NEXT: [[ENTRY:.*:]]
+; GFX-UNSAFE-FP-IEEE-NEXT: [[TMP0:%.*]] = load half, ptr [[Y]], align 8
+; GFX-UNSAFE-FP-IEEE-NEXT: [[CMP:%.*]] = fcmp oeq half [[TMP0]], 0xH0000
+; GFX-UNSAFE-FP-IEEE-NEXT: [[TMP1:%.*]] = load half, ptr [[X]], align 8
+; GFX-UNSAFE-FP-IEEE-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 8
+; GFX-UNSAFE-FP-IEEE-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; GFX-UNSAFE-FP-IEEE: [[COMMON_RET:.*]]:
+; GFX-UNSAFE-FP-IEEE-NEXT: [[COMMON_RET_OP:%.*]] = phi half [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ]
+; GFX-UNSAFE-FP-IEEE-NEXT: ret half [[COMMON_RET_OP]]
+; GFX-UNSAFE-FP-IEEE: [[IF_THEN]]:
+; GFX-UNSAFE-FP-IEEE-NEXT: [[MUL]] = fmul fast half [[TMP1]], [[TMP2]]
+; GFX-UNSAFE-FP-IEEE-NEXT: [[ADD:%.*]] = fadd fast half 0xH3C00, [[MUL]]
+; GFX-UNSAFE-FP-IEEE-NEXT: br label %[[COMMON_RET]]
+; GFX-UNSAFE-FP-IEEE: [[IF_ELSE]]:
+; GFX-UNSAFE-FP-IEEE-NEXT: [[MUL1:%.*]] = fmul fast half [[TMP1]], [[TMP2]]
+; GFX-UNSAFE-FP-IEEE-NEXT: [[SUB]] = fsub fast half [[MUL1]], [[TMP0]]
+; GFX-UNSAFE-FP-IEEE-NEXT: br label %[[COMMON_RET]]
+;
+; GFX-UNSAFE-FP-PRESERVE-LABEL: define half @_branch3(
+; GFX-UNSAFE-FP-PRESERVE-SAME: ptr dereferenceable(8) [[X:%.*]], ptr dereferenceable(8) [[Y:%.*]], ptr dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[ENTRY:.*:]]
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[TMP0:%.*]] = load half, ptr [[Y]], align 8
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[CMP:%.*]] = fcmp oeq half [[TMP0]], 0xH0000
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[TMP1:%.*]] = load half, ptr [[X]], align 8
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 8
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[MUL:%.*]] = fmul fast half [[TMP1]], [[TMP2]]
+; GFX-UNSAFE-FP-PRESERVE-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; GFX-UNSAFE-FP-PRESERVE: [[COMMON_RET:.*]]:
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[COMMON_RET_OP:%.*]] = phi half [ [[MUL]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ]
+; GFX-UNSAFE-FP-PRESERVE-NEXT: ret half [[COMMON_RET_OP]]
+; GFX-UNSAFE-FP-PRESERVE: [[IF_THEN]]:
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[ADD:%.*]] = fadd fast half 0xH3C00, [[MUL]]
+; GFX-UNSAFE-FP-PRESERVE-NEXT: br label %[[COMMON_RET]]
+; GFX-UNSAFE-FP-PRESERVE: [[IF_ELSE]]:
+; GFX-UNSAFE-FP-PRESERVE-NEXT: [[SUB]] = fsub fast half [[MUL]], [[TMP0]]
+; GFX-UNSAFE-FP-PRESERVE-NEXT: br label %[[COMMON_RET]]
+;
+entry:
+ %0 = load half, ptr %y, align 8
+ %cmp = fcmp oeq half %0, 0.000000e+00
+ %1 = load half, ptr %x, align 8
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %2 = load half, ptr %a, align 8
+ %mul = fmul fast half %1, %2
+ %add = fadd fast half 1.000000e+00, %mul
+ ret half %mul
+
+if.else: ; preds = %entry
+ %3 = load half, ptr %a, align 8
+ %mul1 = fmul fast half %1, %3
+ %sub = fsub fast half %mul1, %0
+ ret half %sub
+}
+
|
d08abf2
to
f0ede10
Compare
f0ede10
to
1eed780
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Diff isn't showing the test changes
I'll update the test once the pre-commit PR lands. |
Should use stacked PRs instead of manually referring to dependent PRs |
b4de8ec
to
1acb8a6
Compare
Right, I'll keep that in mind. Thanks @arsenm. |
1acb8a6
to
4c6b5ee
Compare
Change-Id: I4e515a1ca6c792500ea8a946e17dc6145e0ecedc
4c6b5ee
to
14be803
Compare
Type *Ty) const { | ||
switch (Ty->getScalarSizeInBits()) { | ||
case 16: { | ||
SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should probably try to defer parsing the attribute as late as possible (i.e. check the features first, and only check this if the mode matters for the subtarget)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will handle this in a follow-up PR (handling modifiers) soon
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/3489. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/11743. Here is the relevant piece of the build log for reference:
|
Fixes #108751. Thanks @Shoreshen for helping out with the test case.