Skip to content

Commit

Permalink
[SDA] Don't stop divergence propagation at the IPD.
Browse files Browse the repository at this point in the history
Summary:
This fixes B42473 and B42706.

This patch makes the SDA propagate branch divergence until the end of the RPO traversal. Before, the SyncDependenceAnalysis propagated divergence only until the IPD in rpo order. RPO is incompatible with post dominance in the presence of loops. This made the SDA crash because blocks were missed in the propagation.

Reviewers: foad, nhaehnle

Subscribers: jvesely, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65274

Change-Id: Ic3fcc51e5850fde3a4649ec3dd84a040a07aaca9
  • Loading branch information
simoll authored and Tim Renouf committed Oct 10, 2019
1 parent cef330f commit cc72be2
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 35 deletions.
61 changes: 26 additions & 35 deletions llvm/lib/Analysis/SyncDependenceAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,23 +218,31 @@ struct DivergencePropagator {
template <typename SuccessorIterable>
std::unique_ptr<ConstBlockSet>
computeJoinPoints(const BasicBlock &RootBlock,
SuccessorIterable NodeSuccessors, const Loop *ParentLoop, const BasicBlock * PdBoundBlock) {
SuccessorIterable NodeSuccessors, const Loop *ParentLoop) {
assert(JoinBlocks);

LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints. Parent loop: " << (ParentLoop ? ParentLoop->getName() : "<null>") << "\n" );

// bootstrap with branch targets
for (const auto *SuccBlock : NodeSuccessors) {
DefMap.emplace(SuccBlock, SuccBlock);

if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
// immediate loop exit from node.
ReachedLoopExits.insert(SuccBlock);
continue;
} else {
// regular successor
PendingUpdates.insert(SuccBlock);
}
}

LLVM_DEBUG(
dbgs() << "SDA: rpo order:\n";
for (const auto * RpoBlock : FuncRPOT) {
dbgs() << "- " << RpoBlock->getName() << "\n";
}
);

auto ItBeginRPO = FuncRPOT.begin();

// skip until term (TODO RPOT won't let us start at @term directly)
Expand All @@ -245,16 +253,18 @@ struct DivergencePropagator {

// propagate definitions at the immediate successors of the node in RPO
auto ItBlockRPO = ItBeginRPO;
while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
while ((++ItBlockRPO != ItEndRPO) &&
!PendingUpdates.empty()) {
const auto *Block = *ItBlockRPO;
LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n");

// skip @block if not pending update
// skip Block if not pending update
auto ItPending = PendingUpdates.find(Block);
if (ItPending == PendingUpdates.end())
continue;
PendingUpdates.erase(ItPending);

// propagate definition at @block to its successors
// propagate definition at Block to its successors
auto ItDef = DefMap.find(Block);
const auto *DefBlock = ItDef->second;
assert(DefBlock);
Expand All @@ -278,6 +288,8 @@ struct DivergencePropagator {
}
}

LLVM_DEBUG(dbgs() << "SDA::joins. After propagation:\n"; printDefs(dbgs()));

// We need to know the definition at the parent loop header to decide
// whether the definition at the header is different from the definition at
// the loop exits, which would indicate a divergent loop exits.
Expand All @@ -292,24 +304,17 @@ struct DivergencePropagator {
// |
// proper exit from both loops
//
// D post-dominates B as it is the only proper exit from the "A loop".
// If C has a divergent branch, propagation will therefore stop at D.
// That implies that B will never receive a definition.
// But that definition can only be the same as at D (D itself in thise case)
// because all paths to anywhere have to pass through D.
//
const BasicBlock *ParentLoopHeader =
ParentLoop ? ParentLoop->getHeader() : nullptr;
if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
}

// analyze reached loop exits
if (!ReachedLoopExits.empty()) {
const BasicBlock *ParentLoopHeader =
ParentLoop ? ParentLoop->getHeader() : nullptr;

assert(ParentLoop);
const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
auto ItHeaderDef = DefMap.find(ParentLoopHeader);
const auto *HeaderDefBlock = (ItHeaderDef == DefMap.end()) ? nullptr : ItHeaderDef->second;

LLVM_DEBUG(printDefs(dbgs()));
assert(HeaderDefBlock && "no definition in header of carrying loop");
assert(HeaderDefBlock && "no definition at header of carrying loop");

for (const auto *ExitBlock : ReachedLoopExits) {
auto ItExitDef = DefMap.find(ExitBlock);
Expand Down Expand Up @@ -339,19 +344,10 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) {
return *ItCached->second;
}

// dont propagte beyond the immediate post dom of the loop
const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Loop.getHeader()));
const auto *IpdNode = PdNode->getIDom();
const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
while (PdBoundBlock && Loop.contains(PdBoundBlock)) {
IpdNode = IpdNode->getIDom();
PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
}

// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
*Loop.getHeader(), LoopExits, Loop.getParentLoop(), PdBoundBlock);
*Loop.getHeader(), LoopExits, Loop.getParentLoop());

auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks));
assert(ItInserted.second);
Expand All @@ -370,16 +366,11 @@ SyncDependenceAnalysis::join_blocks(const Instruction &Term) {
if (ItCached != CachedBranchJoins.end())
return *ItCached->second;

// dont propagate beyond the immediate post dominator of the branch
const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Term.getParent()));
const auto *IpdNode = PdNode->getIDom();
const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;

// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
const auto &TermBlock = *Term.getParent();
auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>(
TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock), PdBoundBlock);
TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));

auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
assert(ItInserted.second);
Expand Down
111 changes: 111 additions & 0 deletions llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s

declare i32 @gf2(i32)
declare i32 @gf1(i32)

define void @tw1(i32 addrspace(4)* noalias nocapture readonly %A, i32 addrspace(4)* noalias nocapture %B) local_unnamed_addr #2 {
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'tw1':
; CHECK: DIVERGENT: i32 addrspace(4)* %A
; CHECK: DIVERGENT: i32 addrspace(4)* %B
entry:
; CHECK: DIVERGENT: %call = tail call i32 @gf2(i32 0) #0
; CHECK: DIVERGENT: %cmp = icmp ult i32 %call, 16
; CHECK: DIVERGENT: br i1 %cmp, label %if.then, label %new_exit
%call = tail call i32 @gf2(i32 0) #3
%cmp = icmp ult i32 %call, 16
br i1 %cmp, label %if.then, label %new_exit

if.then:
; CHECK: DIVERGENT: %call1 = tail call i32 @gf1(i32 0) #0
; CHECK: DIVERGENT: %arrayidx = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %call1
; CHECK: DIVERGENT: %0 = load i32, i32 addrspace(4)* %arrayidx, align 4
; CHECK: DIVERGENT: %cmp225 = icmp sgt i32 %0, 0
; CHECK: DIVERGENT: %arrayidx10 = getelementptr inbounds i32, i32 addrspace(4)* %B, i32 %call1
; CHECK: DIVERGENT: br i1 %cmp225, label %while.body.preheader, label %if.then.while.end_crit_edge
%call1 = tail call i32 @gf1(i32 0) #4
%arrayidx = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %call1
%0 = load i32, i32 addrspace(4)* %arrayidx, align 4
%cmp225 = icmp sgt i32 %0, 0
%arrayidx10 = getelementptr inbounds i32, i32 addrspace(4)* %B, i32 %call1
br i1 %cmp225, label %while.body.preheader, label %if.then.while.end_crit_edge

while.body.preheader:
br label %while.body

if.then.while.end_crit_edge:
; CHECK: DIVERGENT: %.pre = load i32, i32 addrspace(4)* %arrayidx10, align 4
%.pre = load i32, i32 addrspace(4)* %arrayidx10, align 4
br label %while.end

while.body:
; CHECK-NOT: DIVERGENT: %i.026 = phi i32 [ %inc, %if.end.while.body_crit_edge ], [ 0, %while.body.preheader ]
; CHECK: DIVERGENT: %call3 = tail call i32 @gf1(i32 0) #0
; CHECK: DIVERGENT: %cmp4 = icmp ult i32 %call3, 10
; CHECK: DIVERGENT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %i.026
; CHECK: DIVERGENT: %1 = load i32, i32 addrspace(4)* %arrayidx6, align 4
; CHECK: DIVERGENT: br i1 %cmp4, label %if.then5, label %if.else
%i.026 = phi i32 [ %inc, %if.end.while.body_crit_edge ], [ 0, %while.body.preheader ]
%call3 = tail call i32 @gf1(i32 0) #4
%cmp4 = icmp ult i32 %call3, 10
%arrayidx6 = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %i.026
%1 = load i32, i32 addrspace(4)* %arrayidx6, align 4
br i1 %cmp4, label %if.then5, label %if.else

if.then5:
; CHECK: DIVERGENT: %mul = shl i32 %1, 1
; CHECK: DIVERGENT: %2 = load i32, i32 addrspace(4)* %arrayidx10, align 4
; CHECK: DIVERGENT: %add = add nsw i32 %2, %mul
%mul = shl i32 %1, 1
%2 = load i32, i32 addrspace(4)* %arrayidx10, align 4
%add = add nsw i32 %2, %mul
br label %if.end

if.else:
; CHECK: DIVERGENT: %mul9 = shl i32 %1, 2
; CHECK: DIVERGENT: %3 = load i32, i32 addrspace(4)* %arrayidx10, align 4
; CHECK: DIVERGENT: %add11 = add nsw i32 %3, %mul9
%mul9 = shl i32 %1, 2
%3 = load i32, i32 addrspace(4)* %arrayidx10, align 4
%add11 = add nsw i32 %3, %mul9
br label %if.end

if.end:
; CHECK: DIVERGENT: %storemerge = phi i32 [ %add11, %if.else ], [ %add, %if.then5 ]
; CHECK: DIVERGENT: store i32 %storemerge, i32 addrspace(4)* %arrayidx10, align 4
; CHECK-NOT: DIVERGENT: %inc = add nuw nsw i32 %i.026, 1
; CHECK: DIVERGENT: %exitcond = icmp ne i32 %inc, %0
; CHECK: DIVERGENT: br i1 %exitcond, label %if.end.while.body_crit_edge, label %while.end.loopexit
%storemerge = phi i32 [ %add11, %if.else ], [ %add, %if.then5 ]
store i32 %storemerge, i32 addrspace(4)* %arrayidx10, align 4
%inc = add nuw nsw i32 %i.026, 1
%exitcond = icmp ne i32 %inc, %0
br i1 %exitcond, label %if.end.while.body_crit_edge, label %while.end.loopexit

if.end.while.body_crit_edge:
br label %while.body

while.end.loopexit:
; CHECK: DIVERGENT: %storemerge.lcssa = phi i32 [ %storemerge, %if.end ]
%storemerge.lcssa = phi i32 [ %storemerge, %if.end ]
br label %while.end

while.end:
; CHECK: DIVERGENT: %4 = phi i32 [ %.pre, %if.then.while.end_crit_edge ], [ %storemerge.lcssa, %while.end.loopexit ]
; CHECK: DIVERGENT: %i.0.lcssa = phi i32 [ 0, %if.then.while.end_crit_edge ], [ %0, %while.end.loopexit ]
; CHECK: DIVERGENT: %sub = sub nsw i32 %4, %i.0.lcssa
; CHECK: DIVERGENT: store i32 %sub, i32 addrspace(4)* %arrayidx10, align 4
%4 = phi i32 [ %.pre, %if.then.while.end_crit_edge ], [ %storemerge.lcssa, %while.end.loopexit ]
%i.0.lcssa = phi i32 [ 0, %if.then.while.end_crit_edge ], [ %0, %while.end.loopexit ]
%sub = sub nsw i32 %4, %i.0.lcssa
store i32 %sub, i32 addrspace(4)* %arrayidx10, align 4
br label %new_exit

new_exit:
ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone }

0 comments on commit cc72be2

Please sign in to comment.