Commit d68c17bc authored by Hans Wennborg's avatar Hans Wennborg
Browse files

Merging r321751, r321806, and r321878:

------------------------------------------------------------------------
r321751 | arsenm | 2018-01-03 10:45:37 -0800 (Wed, 03 Jan 2018) | 25 lines

StructurizeCFG: Fix broken backedge detection

The work order was changed in r228186 from SCC order
to RPO with an arbitrary sorting function. The sorting
function attempted to move inner loop nodes earlier. This
was apparently relying on an assumption that every block
in a given loop / the same loop depth would be seen before
visiting another loop. In the broken testcase, a block
outside of the loop was encountered before moving onto
another block in the same loop. The testcase would then
structurize such that one block's unconditional successor
could never be reached.

Revert to plain RPO for the analysis phase. This fixes
detecting edges as backedges that aren't really.

The processing phase does use another visited set, and
I'm unclear on whether the order there is as important.
An arbitrary order doesn't work, and triggers some infinite
loops. The reversed RPO list seems to work and is closer
to the order that was used before, minus the arbitrary
custom sorting.

A few of the changed tests now produce smaller code,
and a few are slightly worse looking.
------------------------------------------------------------------------

------------------------------------------------------------------------
r321806 | arsenm | 2018-01-04 09:23:24 -0800 (Thu, 04 Jan 2018) | 4 lines

StructurizeCFG: xfail one of the testcases from r321751

It fails with -verify-region-info. This seems to be an issue
with RegionInfo itself which existed before.
------------------------------------------------------------------------

------------------------------------------------------------------------
r321878 | arsenm | 2018-01-05 09:51:36 -0800 (Fri, 05 Jan 2018) | 4 lines

RegionInfo: Use report_fatal_error instead of llvm_unreachable

Otherwise when using -verify-region-info in a release build the
error won't be emitted.
------------------------------------------------------------------------

llvm-svn: 322686
parent b539787a
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -254,14 +254,14 @@ std::string RegionBase<Tr>::getNameStr() const {
template <class Tr>
void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
  if (!contains(BB))
    llvm_unreachable("Broken region found: enumerated BB not in region!");
    report_fatal_error("Broken region found: enumerated BB not in region!");

  BlockT *entry = getEntry(), *exit = getExit();

  for (BlockT *Succ :
       make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
    if (!contains(Succ) && exit != Succ)
      llvm_unreachable("Broken region found: edges leaving the region must go "
      report_fatal_error("Broken region found: edges leaving the region must go "
                         "to the exit node!");
  }

@@ -269,7 +269,7 @@ void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
    for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB),
                                   InvBlockTraits::child_end(BB))) {
      if (!contains(Pred))
        llvm_unreachable("Broken region found: edges entering the region must "
        report_fatal_error("Broken region found: edges entering the region must "
                           "go to the entry node!");
    }
  }
@@ -557,7 +557,7 @@ void RegionInfoBase<Tr>::verifyBBMap(const RegionT *R) const {
    } else {
      BlockT *BB = Element->template getNodeAs<BlockT>();
      if (getRegionFor(BB) != R)
        llvm_unreachable("BB map does not match region nesting");
        report_fatal_error("BB map does not match region nesting");
    }
  }
}
+28 −82
Original line number Diff line number Diff line
@@ -14,7 +14,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
@@ -177,9 +176,8 @@ class StructurizeCFG : public RegionPass {
  Region *ParentRegion;

  DominatorTree *DT;
  LoopInfo *LI;

  SmallVector<RegionNode *, 8> Order;
  std::deque<RegionNode *> Order;
  BBSet Visited;

  BBPhiMap DeletedPhis;
@@ -204,7 +202,7 @@ class StructurizeCFG : public RegionPass {

  void gatherPredicates(RegionNode *N);

  void collectInfos();
  void analyzeNode(RegionNode *N);

  void insertConditions(bool Loops);

@@ -258,7 +256,6 @@ public:
      AU.addRequired<DivergenceAnalysis>();
    AU.addRequiredID(LowerSwitchID);
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<DominatorTreeWrapperPass>();
    RegionPass::getAnalysisUsage(AU);
@@ -292,56 +289,18 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {

/// \brief Build up the general order of nodes
void StructurizeCFG::orderNodes() {
  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
  SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
  assert(Visited.empty());
  assert(Predicates.empty());
  assert(Loops.empty());
  assert(LoopPreds.empty());

  // The reverse post-order traversal of the list gives us an ordering close
  // to what we want.  The only problem with it is that sometimes backedges
  // for outer loops will be visited before backedges for inner loops.
  for (RegionNode *RN : RPOT) {
    BasicBlock *BB = RN->getEntry();
    Loop *Loop = LI->getLoopFor(BB);
    ++LoopBlocks[Loop];
  }

  unsigned CurrentLoopDepth = 0;
  Loop *CurrentLoop = nullptr;
  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
    BasicBlock *BB = (*I)->getEntry();
    unsigned LoopDepth = LI->getLoopDepth(BB);

    if (is_contained(Order, *I))
      continue;

    if (LoopDepth < CurrentLoopDepth) {
      // Make sure we have visited all blocks in this loop before moving back to
      // the outer loop.

      auto LoopI = I;
      while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
        LoopI++;
        BasicBlock *LoopBB = (*LoopI)->getEntry();
        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
          --BlockCount;
          Order.push_back(*LoopI);
  // This must be RPO order for the back edge detection to work
  for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
    // FIXME: Is there a better order to use for structurization?
    Order.push_back(RN);
    analyzeNode(RN);
  }
}
    }

    CurrentLoop = LI->getLoopFor(BB);
    if (CurrentLoop)
      LoopBlocks[CurrentLoop]--;

    CurrentLoopDepth = LoopDepth;
    Order.push_back(*I);
  }

  // This pass originally used a post-order traversal and then operated on
  // the list in reverse. Now that we are using a reverse post-order traversal
  // rather than re-working the whole pass to operate on the list in order,
  // we just reverse the list and continue to operate on it in reverse.
  std::reverse(Order.begin(), Order.end());
}

/// \brief Determine the end of the loops
void StructurizeCFG::analyzeLoops(RegionNode *N) {
@@ -466,22 +425,10 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
}

/// \brief Collect various loop and predicate infos
void StructurizeCFG::collectInfos() {
  // Reset predicate
  Predicates.clear();

  // and loop infos
  Loops.clear();
  LoopPreds.clear();

  // Reset the visited nodes
  Visited.clear();

  for (RegionNode *RN : reverse(Order)) {
void StructurizeCFG::analyzeNode(RegionNode *RN) {
  DEBUG(dbgs() << "Visiting: "
        << (RN->isSubRegion() ? "SubRegion with entry: " : "")
                 << RN->getEntry()->getName() << " Loop Depth: "
                 << LI->getLoopDepth(RN->getEntry()) << "\n");
        << RN->getEntry()->getName() << '\n');

  // Analyze all the conditions leading to a node
  gatherPredicates(RN);
@@ -492,7 +439,6 @@ void StructurizeCFG::collectInfos() {
  // Find the last back edges
  analyzeLoops(RN);
}
}

/// \brief Insert the missing branch conditions
void StructurizeCFG::insertConditions(bool Loops) {
@@ -664,7 +610,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
  LLVMContext &Context = Func->getContext();
  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
                       Order.back()->getEntry();
                       Order.front()->getEntry();
  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
                                        Func, Insert);
  DT->addNewBlock(Flow, Dominator);
@@ -744,7 +690,8 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
/// Take one node from the order vector and wire it up
void StructurizeCFG::wireFlow(bool ExitUseAllowed,
                              BasicBlock *LoopEnd) {
  RegionNode *Node = Order.pop_back_val();
  RegionNode *Node = Order.front();
  Order.pop_front();
  Visited.insert(Node->getEntry());

  if (isPredictableTrue(Node)) {
@@ -768,7 +715,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

    PrevNode = Node;
    while (!Order.empty() && !Visited.count(LoopEnd) &&
           dominatesPredicates(Entry, Order.back())) {
           dominatesPredicates(Entry, Order.front())) {
      handleLoops(false, LoopEnd);
    }

@@ -779,7 +726,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                                 BasicBlock *LoopEnd) {
  RegionNode *Node = Order.back();
  RegionNode *Node = Order.front();
  BasicBlock *LoopStart = Node->getEntry();

  if (!Loops.count(LoopStart)) {
@@ -924,10 +871,9 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
  ParentRegion = R;

  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  orderNodes();
  collectInfos();

  createFlow();
  insertConditions(false);
  insertConditions(true);
+2 −1
Original line number Diff line number Diff line
@@ -66,9 +66,10 @@ ENDIF: ; preds = %LOOP

; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
; OPT: llvm.amdgcn.break
; OPT: llvm.amdgcn.loop
; OPT: llvm.amdgcn.break
; OPT: llvm.amdgcn.if.break
; OPT: llvm.amdgcn.if.break
; OPT: llvm.amdgcn.loop
; OPT: llvm.amdgcn.end.cf

; GCN-LABEL: {{^}}multi_if_break_loop:
+86 −41
Original line number Diff line number Diff line
@@ -124,55 +124,100 @@ bb23: ; preds = %bb10
; Earlier version of above, before a run of the structurizer.
; IR-LABEL: @nested_loop_conditions(

; IR: Flow7:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
; IR:   br i1 %tmp1235, label %bb14.lr.ph, label %Flow

; IR: bb14.lr.ph:
; IR: br label %bb14

; IR: Flow3:
; IR:   call void @llvm.amdgcn.end.cf(i64 %18)
; IR:   %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
; IR:   %1 = extractvalue { i1, i64 } %0, 0
; IR:   %2 = extractvalue { i1, i64 } %0, 1
; IR:   br i1 %1, label %bb4.bb13_crit_edge, label %Flow4

; IR: bb4.bb13_crit_edge:
; IR:   br label %Flow4

; IR: Flow4:
; IR:   %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
; IR:   call void @llvm.amdgcn.end.cf(i64 %2)
; IR:   br label %Flow

; IR: bb13:
; IR:   br label %bb31

; IR: Flow:
; IR:   %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
; IR:   %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
; IR:   %6 = extractvalue { i1, i64 } %5, 0
; IR:   %7 = extractvalue { i1, i64 } %5, 1
; IR:   br i1 %6, label %bb13, label %bb31

; IR: bb14:
; IR:   %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
; IR:   %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
; IR:   %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
; IR:   %tmp15 = icmp eq i32 %tmp1037, 1
; IR:   %8 = xor i1 %tmp15, true
; IR:   %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
; IR:   %10 = extractvalue { i1, i64 } %9, 0
; IR:   %11 = extractvalue { i1, i64 } %9, 1
; IR:   br i1 %10, label %bb31.loopexit, label %Flow1

; IR: Flow1:
; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
; IR-NEXT: br i1 %18, label %Flow7, label %bb14
; IR:   %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
; IR:   %13 = extractvalue { i1, i64 } %12, 0
; IR:   %14 = extractvalue { i1, i64 } %12, 1
; IR:   br i1 %13, label %bb16, label %Flow2

; IR: bb16:
; IR:   %tmp17 = bitcast i64 %tmp3 to <2 x i32>
; IR:   br label %bb18

; IR: Flow2:
; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
; IR-NEXT: br i1 %25, label %bb21, label %Flow3
; IR:   %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
; IR:   %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
; IR:   %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
; IR:   %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
; IR:   %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
; IR:   call void @llvm.amdgcn.end.cf(i64 %14)
; IR:   %19 = call i1 @llvm.amdgcn.loop(i64 %18)
; IR:   br i1 %19, label %Flow3, label %bb14

; IR: bb18:
; IR:   %tmp19 = load volatile i32, i32 addrspace(1)* undef
; IR:   %tmp20 = icmp slt i32 %tmp19, 9
; IR:   br i1 %tmp20, label %bb21, label %bb18

; IR: bb21:
; IR:   %tmp22 = extractelement <2 x i32> %tmp17, i64 1
; IR:   %tmp23 = lshr i32 %tmp22, 16
; IR:   %tmp24 = select i1 undef, i32 undef, i32 %tmp23
; IR:   %tmp25 = uitofp i32 %tmp24 to float
; IR:   %tmp26 = fmul float %tmp25, 0x3EF0001000000000
; IR:   %tmp27 = fsub float %tmp26, undef
; IR:   %tmp28 = fcmp olt float %tmp27, 5.000000e-01
; IR:   %tmp29 = select i1 %tmp28, i64 1, i64 2
; IR:   %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
; IR:   %tmp7 = zext i32 %tmp30 to i64
; IR:   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
; IR:   %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
; IR:   %tmp10 = extractelement <4 x i32> %tmp9, i64 0
; IR:   %tmp11 = load volatile i32, i32 addrspace(1)* undef
; IR:   %tmp12 = icmp slt i32 %tmp11, 9
; IR-NEXT: %27 = xor i1 %tmp12, true
; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
; IR-NEXT: br label %Flow3
; IR:   %20 = xor i1 %tmp12, true
; IR:   %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
; IR:   br label %Flow2

; IR: Flow3:
; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
; IR: bb31.loopexit:
; IR:   br label %Flow1

; IR: bb31:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
; IR-NEXT: ret void
; IR:   call void @llvm.amdgcn.end.cf(i64 %7)
; IR:   store volatile i32 0, i32 addrspace(1)* undef
; IR:   ret void


; GCN-LABEL: {{^}}nested_loop_conditions:
+77 −0
Original line number Diff line number Diff line
; XFAIL: *
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s

; FIXME: Merge into backedge-id-bug
; Variant which has an issue with region construction

; Test kernel: an outer loop (LOOP.HEADER ... END_ELSE_BLOCK) whose body is an
; if/else. One arm (bb18) contains a single-block inner loop (INNER_LOOP); the
; other arm (bb62) can either leave the outer loop directly to RETURN or
; continue to the latch through INCREMENT_I.
define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
entry:
  %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
  %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
  ; The initial outer-loop value is loaded from %arg0 indexed by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
  %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
  br label %LOOP.HEADER

; Outer loop header: %i comes from %entry on the first trip and from
; END_ELSE_BLOCK on every later trip.
LOOP.HEADER:
  %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
  call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
  %tmp12 = zext i32 %i to i64
  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
  %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
  %tmp15 = extractelement <4 x i32> %tmp14, i64 0
  ; The low 16 bits of element 0 select the arm: equal to 1 -> bb18, otherwise bb62.
  %tmp16 = and i32 %tmp15, 65535
  %tmp17 = icmp eq i32 %tmp16, 1
  br i1 %tmp17, label %bb18, label %bb62

; "Then" arm: computes the starting value for the inner loop.
bb18:
  %tmp19 = extractelement <2 x i32> %tmp, i64 0
  %tmp22 = lshr i32 %tmp19, 16
  %tmp24 = urem i32 %tmp22, 52
  %tmp25 = mul nuw nsw i32 %tmp24, 52
  br label %INNER_LOOP

; Single-block inner loop: branches back to itself until the value of
; %inner.loop.j before the increment is 0.
INNER_LOOP:
  %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
  call void asm sideeffect "; inner loop body", ""() #0
  %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
  %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
  br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP

; Inner loop exit: element 2 of %tmp14 becomes this arm's incoming value for
; %i.final in END_ELSE_BLOCK.
INNER_LOOP_BREAK:
  %tmp59 = extractelement <4 x i32> %tmp14, i64 2
  call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
  br label %END_ELSE_BLOCK

; "Else" arm: exits to RETURN when %tmp16 is unsigned-less-than 271, otherwise
; continues to INCREMENT_I.
bb62:
  %load13 = icmp ult i32 %tmp16, 271
  ;br i1 %load13, label %bb64, label %INCREMENT_I
  ; branching directly to the return avoids the bug
  br i1 %load13, label %RETURN, label %INCREMENT_I


; With the branch to %bb64 commented out above, nothing in this function
; branches to this block.
bb64:
  call void asm sideeffect "s_nop 42", "~{memory}"() #0
  br label %RETURN

; Advances %i by one before joining the latch.
INCREMENT_I:
  %inc.i = add i32 %i, 1
  call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
  br label %END_ELSE_BLOCK

; Outer loop latch: merges both arms, then either exits to RETURN when
; %i.final is -1 or takes the backedge to LOOP.HEADER.
END_ELSE_BLOCK:
  %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
  call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
  %cmp.end.else.block = icmp eq i32 %i.final, -1
  br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER

; Common exit, reached from bb62, bb64 and END_ELSE_BLOCK.
RETURN:
  call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
  store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { convergent nounwind }
attributes #1 = { convergent nounwind readnone }
Loading