diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index e590724a123c4b..5844fb8b0f8938 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1453,7 +1453,7 @@ struct AttributorConfig { /// Callback function to determine if an indirect call targets should be made /// direct call targets (with an if-cascade). std::function + Function &AssumedCallee, unsigned NumAssumedCallees)> IndirectCalleeSpecializationCallback = nullptr; /// Helper to update an underlying call graph and to delete functions. @@ -1723,10 +1723,11 @@ struct Attributor { /// Return true if we should specialize the call site \b CB for the potential /// callee \p Fn. bool shouldSpecializeCallSiteForCallee(const AbstractAttribute &AA, - CallBase &CB, Function &Callee) { + CallBase &CB, Function &Callee, + unsigned NumAssumedCallees) { return Configuration.IndirectCalleeSpecializationCallback - ? Configuration.IndirectCalleeSpecializationCallback(*this, AA, - CB, Callee) + ? Configuration.IndirectCalleeSpecializationCallback( + *this, AA, CB, Callee, NumAssumedCallees) : true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index d84704d4c620ed..ff85c0e44fed14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -14,6 +14,7 @@ #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -1045,13 +1046,26 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, - &AAUnderlyingObjects::ID, &AAAddressSpace::ID}); + &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, + &AAInstanceInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; AC.Allowed = &Allowed; AC.IsModulePass = true; AC.DefaultInitializeLiveInternals = false; + AC.IndirectCalleeSpecializationCallback = + [&TM](Attributor &A, const AbstractAttribute &AA, CallBase &CB, + Function &Callee, unsigned NumAssumedCallees) { + if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv())) + return false; + // Singleton functions can be specialized. + if (NumAssumedCallees == 1) + return true; + // Otherwise specialize uniform values. + const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller()); + return TTI.isAlwaysUniform(CB.getCalledOperand()); + }; AC.IPOAmendableCB = [](const Function &F) { return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; }; diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 910c0aeacc42e0..38b61b6a88357c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -3836,7 +3836,7 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, if (MaxSpecializationPerCB.getNumOccurrences()) { AC.IndirectCalleeSpecializationCallback = [&](Attributor &, const AbstractAttribute &AA, CallBase &CB, - Function &Callee) { + Function &Callee, unsigned) { if (MaxSpecializationPerCB == 0) return false; auto &Set = IndirectCalleeTrackingMap[&CB]; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 6b8fd0e489dd61..9b75c94d0fae3c 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12369,7 +12369,8 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo { SmallVector SkippedAssumedCallees; SmallVector> NewCalls; for (Function *NewCallee : AssumedCallees) { - if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee)) { + if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee, + AssumedCallees.size())) { SkippedAssumedCallees.push_back(NewCallee); SpecializedForAllCallees = false; continue; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll index 18ec3ab64298b7..653ec269faa740 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll @@ -14,7 +14,8 @@ define internal fastcc void @foo(ptr %kg) { ; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276 ; CHECK-NEXT: br label %[[WHILE_COND:.*]] ; CHECK: [[WHILE_COND]]: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[KG]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[KG]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[IDXPROM_I:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: switch i32 0, label %[[SW_BB92:.*]] [ ; CHECK-NEXT: i32 1, label %[[SW_BB92]] @@ -22,18 +23,22 @@ define internal fastcc void @foo(ptr %kg) { ; CHECK-NEXT: ] ; CHECK: [[SUBD_TRIANGLE_PATCH_EXIT_I_I35]]: ; CHECK-NEXT: [[ARRAYIDX_I27_I:%.*]] = getelementptr float, ptr [[KG]], i64 [[IDXPROM_I]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX_I27_I]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX_I27_I]] to ptr addrspace(5) +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP5]], align 4 ; CHECK-NEXT: br label %[[WHILE_COND]] ; CHECK: [[SW_BB92]]: ; CHECK-NEXT: [[INSERT:%.*]] = insertelement <3 x i32> zeroinitializer, i32 [[TMP1]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = bitcast <3 x i32> [[INSERT]] to <3 x float> ; CHECK-NEXT: [[SHFL:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT_I]], <3 x float> zeroinitializer, <4 x i32> zeroinitializer -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[NUM_CLOSURE_I26_I]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5) +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 ; CHECK-NEXT: [[IDXPROM_I27_I:%.*]] = sext i32 [[LOAD]] to i64 ; CHECK-NEXT: [[ARRAYIDX_I28_I:%.*]] = getelementptr [64 x %struct.ShaderClosure], ptr [[CLOSURE_I25_I]], i64 0, i64 [[IDXPROM_I27_I]] -; CHECK-NEXT: store <4 x float> [[SHFL]], ptr [[ARRAYIDX_I28_I]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX_I28_I]] to ptr addrspace(5) +; CHECK-NEXT: store <4 x float> [[SHFL]], ptr addrspace(5) [[TMP3]], align 16 ; CHECK-NEXT: [[INC_I30_I:%.*]] = or i32 [[LOAD]], 1 -; CHECK-NEXT: store i32 [[INC_I30_I]], ptr [[NUM_CLOSURE_I26_I]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5) +; CHECK-NEXT: store i32 [[INC_I30_I]], ptr addrspace(5) [[TMP4]], align 4 ; CHECK-NEXT: br label %[[WHILE_COND]] ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 386f9cd3f9ce73..37f5564c6e5ffd 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -14,8 +14,7 @@ define internal void @direct() { ; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 -; CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; CHECK-NEXT: call void [[FP]]() +; CHECK-NEXT: call void @indirect() ; CHECK-NEXT: ret void ; %fptr = alloca ptr, addrspace(5) @@ -36,5 +35,5 @@ define amdgpu_kernel void @test_direct_indirect_call() { } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 05558c555c581e..3e9e97755fdf79 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -26,8 +26,7 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() +; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; %fptr = alloca ptr, addrspace(5) @@ -43,5 +42,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index ef37bbf34cf7ba..591a775de122c4 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -60,8 +60,7 @@ define amdgpu_kernel void @foo(ptr noundef %fp) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; CHECK-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8 -; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 -; CHECK-NEXT: call void [[LOAD]]() +; CHECK-NEXT: call void [[FP]]() ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 3a6b0485d24174..a09b4e683b939a 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -35,8 +35,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() +; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; ; GFX9-LABEL: test_simple_indirect_call: @@ -81,7 +80,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/revert_patches.txt b/revert_patches.txt index 1bb8ce7fec12a8..c7ab446e455b7c 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -75,8 +75,6 @@ dfeb3991fb48 Remove the `x86_mmx` IR type. (#98505) b7e4fba6e5dc Cleanup x86_mmx after removing IR type (#100646) (Reason: dependent on dfeb3991fb48) Ron: still broken 9-6-24 --- -revert: 1ca9fe6db334 Reapply "[Attributor][AMDGPU] Enable AAIndirectCallInfo for AMDAttributor ---- revert : breaks build of amdgpu flat intrinsics ee08d9cba561 AMDGPU: Remove global/flat atomic fadd intrinics (#97051) ---