Merging r321751, r321806, and r321878: (d68c17bc) · Commits · llvm-doe / llvm-project

llvm/include/llvm/Analysis/RegionInfoImpl.h

+6 −6

Original line number	Diff line number	Diff line
		@@ -254,14 +254,14 @@ std::string RegionBase<Tr>::getNameStr() const {
		template <class Tr>
		void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
		if (!contains(BB))
		llvm_unreachable("Broken region found: enumerated BB not in region!");
		report_fatal_error("Broken region found: enumerated BB not in region!");

		BlockT *entry = getEntry(), *exit = getExit();

		for (BlockT *Succ :
		make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
		if (!contains(Succ) && exit != Succ)
		llvm_unreachable("Broken region found: edges leaving the region must go "
		report_fatal_error("Broken region found: edges leaving the region must go "
		"to the exit node!");
		}

		@@ -269,7 +269,7 @@ void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
		for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB),
		InvBlockTraits::child_end(BB))) {
		if (!contains(Pred))
		llvm_unreachable("Broken region found: edges entering the region must "
		report_fatal_error("Broken region found: edges entering the region must "
		"go to the entry node!");
		}
		}
		@@ -557,7 +557,7 @@ void RegionInfoBase<Tr>::verifyBBMap(const RegionT *R) const {
		} else {
		BlockT *BB = Element->template getNodeAs<BlockT>();
		if (getRegionFor(BB) != R)
		llvm_unreachable("BB map does not match region nesting");
		report_fatal_error("BB map does not match region nesting");
		}
		}
		}

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

+28 −82

Original line number	Diff line number	Diff line
		@@ -14,7 +14,6 @@
		#include "llvm/ADT/SmallPtrSet.h"
		#include "llvm/ADT/SmallVector.h"
		#include "llvm/Analysis/DivergenceAnalysis.h"
		#include "llvm/Analysis/LoopInfo.h"
		#include "llvm/Analysis/RegionInfo.h"
		#include "llvm/Analysis/RegionIterator.h"
		#include "llvm/Analysis/RegionPass.h"
		@@ -177,9 +176,8 @@ class StructurizeCFG : public RegionPass {
		Region *ParentRegion;

		DominatorTree *DT;
		LoopInfo *LI;

		SmallVector<RegionNode *, 8> Order;
		std::deque<RegionNode *> Order;
		BBSet Visited;

		BBPhiMap DeletedPhis;
		@@ -204,7 +202,7 @@ class StructurizeCFG : public RegionPass {

		void gatherPredicates(RegionNode *N);

		void collectInfos();
		void analyzeNode(RegionNode *N);

		void insertConditions(bool Loops);

		@@ -258,7 +256,6 @@ public:
		AU.addRequired<DivergenceAnalysis>();
		AU.addRequiredID(LowerSwitchID);
		AU.addRequired<DominatorTreeWrapperPass>();
		AU.addRequired<LoopInfoWrapperPass>();

		AU.addPreserved<DominatorTreeWrapperPass>();
		RegionPass::getAnalysisUsage(AU);
		@@ -292,56 +289,18 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {

		/// \brief Build up the general order of nodes
		void StructurizeCFG::orderNodes() {
		ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
		SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
		assert(Visited.empty());
		assert(Predicates.empty());
		assert(Loops.empty());
		assert(LoopPreds.empty());

		// The reverse post-order traversal of the list gives us an ordering close
		// to what we want. The only problem with it is that sometimes backedges
		// for outer loops will be visited before backedges for inner loops.
		for (RegionNode *RN : RPOT) {
		BasicBlock *BB = RN->getEntry();
		Loop *Loop = LI->getLoopFor(BB);
		++LoopBlocks[Loop];
		}

		unsigned CurrentLoopDepth = 0;
		Loop *CurrentLoop = nullptr;
		for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
		BasicBlock *BB = (*I)->getEntry();
		unsigned LoopDepth = LI->getLoopDepth(BB);

		if (is_contained(Order, *I))
		continue;

		if (LoopDepth < CurrentLoopDepth) {
		// Make sure we have visited all blocks in this loop before moving back to
		// the outer loop.

		auto LoopI = I;
		while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
		LoopI++;
		BasicBlock *LoopBB = (*LoopI)->getEntry();
		if (LI->getLoopFor(LoopBB) == CurrentLoop) {
		--BlockCount;
		Order.push_back(*LoopI);
		// This must be RPO order for the back edge detection to work
		for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
		// FIXME: Is there a better order to use for structurization?
		Order.push_back(RN);
		analyzeNode(RN);
		}
		}
		}

		CurrentLoop = LI->getLoopFor(BB);
		if (CurrentLoop)
		LoopBlocks[CurrentLoop]--;

		CurrentLoopDepth = LoopDepth;
		Order.push_back(*I);
		}

		// This pass originally used a post-order traversal and then operated on
		// the list in reverse. Now that we are using a reverse post-order traversal
		// rather than re-working the whole pass to operate on the list in order,
		// we just reverse the list and continue to operate on it in reverse.
		std::reverse(Order.begin(), Order.end());
		}

		/// \brief Determine the end of the loops
		void StructurizeCFG::analyzeLoops(RegionNode *N) {
		@@ -466,22 +425,10 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
		}

		/// \brief Collect various loop and predicate infos
		void StructurizeCFG::collectInfos() {
		// Reset predicate
		Predicates.clear();

		// and loop infos
		Loops.clear();
		LoopPreds.clear();

		// Reset the visited nodes
		Visited.clear();

		for (RegionNode *RN : reverse(Order)) {
		void StructurizeCFG::analyzeNode(RegionNode *RN) {
		DEBUG(dbgs() << "Visiting: "
		<< (RN->isSubRegion() ? "SubRegion with entry: " : "")
		<< RN->getEntry()->getName() << " Loop Depth: "
		<< LI->getLoopDepth(RN->getEntry()) << "\n");
		<< RN->getEntry()->getName() << '\n');

		// Analyze all the conditions leading to a node
		gatherPredicates(RN);
		@@ -492,7 +439,6 @@ void StructurizeCFG::collectInfos() {
		// Find the last back edges
		analyzeLoops(RN);
		}
		}

		/// \brief Insert the missing branch conditions
		void StructurizeCFG::insertConditions(bool Loops) {
		@@ -664,7 +610,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
		BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
		LLVMContext &Context = Func->getContext();
		BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
		Order.back()->getEntry();
		Order.front()->getEntry();
		BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
		Func, Insert);
		DT->addNewBlock(Flow, Dominator);
		@@ -744,7 +690,8 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
		/// Take one node from the order vector and wire it up
		void StructurizeCFG::wireFlow(bool ExitUseAllowed,
		BasicBlock *LoopEnd) {
		RegionNode *Node = Order.pop_back_val();
		RegionNode *Node = Order.front();
		Order.pop_front();
		Visited.insert(Node->getEntry());

		if (isPredictableTrue(Node)) {
		@@ -768,7 +715,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

		PrevNode = Node;
		while (!Order.empty() && !Visited.count(LoopEnd) &&
		dominatesPredicates(Entry, Order.back())) {
		dominatesPredicates(Entry, Order.front())) {
		handleLoops(false, LoopEnd);
		}

		@@ -779,7 +726,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

		void StructurizeCFG::handleLoops(bool ExitUseAllowed,
		BasicBlock *LoopEnd) {
		RegionNode *Node = Order.back();
		RegionNode *Node = Order.front();
		BasicBlock *LoopStart = Node->getEntry();

		if (!Loops.count(LoopStart)) {
		@@ -924,10 +871,9 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
		ParentRegion = R;

		DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
		LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

		orderNodes();
		collectInfos();

		createFlow();
		insertConditions(false);
		insertConditions(true);

llvm/test/CodeGen/AMDGPU/multilevel-break.ll

+2 −1

Original line number	Diff line number	Diff line
		@@ -66,9 +66,10 @@ ENDIF: ; preds = %LOOP

		; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
		; OPT: llvm.amdgcn.break
		; OPT: llvm.amdgcn.loop
		; OPT: llvm.amdgcn.break
		; OPT: llvm.amdgcn.if.break
		; OPT: llvm.amdgcn.if.break
		; OPT: llvm.amdgcn.loop
		; OPT: llvm.amdgcn.end.cf

		; GCN-LABEL: {{^}}multi_if_break_loop:

llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll

+86 −41

Original line number	Diff line number	Diff line
		@@ -124,55 +124,100 @@ bb23: ; preds = %bb10
		; Earlier version of above, before a run of the structurizer.
		; IR-LABEL: @nested_loop_conditions(

		; IR: Flow7:
		; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
		; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
		; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
		; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
		; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
		; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
		; IR: br i1 %tmp1235, label %bb14.lr.ph, label %Flow

		; IR: bb14.lr.ph:
		; IR: br label %bb14

		; IR: Flow3:
		; IR: call void @llvm.amdgcn.end.cf(i64 %18)
		; IR: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
		; IR: %1 = extractvalue { i1, i64 } %0, 0
		; IR: %2 = extractvalue { i1, i64 } %0, 1
		; IR: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4

		; IR: bb4.bb13_crit_edge:
		; IR: br label %Flow4

		; IR: Flow4:
		; IR: %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
		; IR: call void @llvm.amdgcn.end.cf(i64 %2)
		; IR: br label %Flow

		; IR: bb13:
		; IR: br label %bb31

		; IR: Flow:
		; IR: %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
		; IR: %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
		; IR: %6 = extractvalue { i1, i64 } %5, 0
		; IR: %7 = extractvalue { i1, i64 } %5, 1
		; IR: br i1 %6, label %bb13, label %bb31

		; IR: bb14:
		; IR: %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
		; IR: %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
		; IR: %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
		; IR: %tmp15 = icmp eq i32 %tmp1037, 1
		; IR: %8 = xor i1 %tmp15, true
		; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
		; IR: %10 = extractvalue { i1, i64 } %9, 0
		; IR: %11 = extractvalue { i1, i64 } %9, 1
		; IR: br i1 %10, label %bb31.loopexit, label %Flow1

		; IR: Flow1:
		; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
		; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
		; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
		; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
		; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
		; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
		; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
		; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
		; IR-NEXT: br i1 %18, label %Flow7, label %bb14
		; IR: %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
		; IR: %13 = extractvalue { i1, i64 } %12, 0
		; IR: %14 = extractvalue { i1, i64 } %12, 1
		; IR: br i1 %13, label %bb16, label %Flow2

		; IR: bb16:
		; IR: %tmp17 = bitcast i64 %tmp3 to <2 x i32>
		; IR: br label %bb18

		; IR: Flow2:
		; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
		; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
		; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
		; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
		; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
		; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
		; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
		; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
		; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
		; IR-NEXT: br i1 %25, label %bb21, label %Flow3
		; IR: %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
		; IR: %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
		; IR: %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
		; IR: %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
		; IR: %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
		; IR: call void @llvm.amdgcn.end.cf(i64 %14)
		; IR: %19 = call i1 @llvm.amdgcn.loop(i64 %18)
		; IR: br i1 %19, label %Flow3, label %bb14

		; IR: bb18:
		; IR: %tmp19 = load volatile i32, i32 addrspace(1)* undef
		; IR: %tmp20 = icmp slt i32 %tmp19, 9
		; IR: br i1 %tmp20, label %bb21, label %bb18

		; IR: bb21:
		; IR: %tmp22 = extractelement <2 x i32> %tmp17, i64 1
		; IR: %tmp23 = lshr i32 %tmp22, 16
		; IR: %tmp24 = select i1 undef, i32 undef, i32 %tmp23
		; IR: %tmp25 = uitofp i32 %tmp24 to float
		; IR: %tmp26 = fmul float %tmp25, 0x3EF0001000000000
		; IR: %tmp27 = fsub float %tmp26, undef
		; IR: %tmp28 = fcmp olt float %tmp27, 5.000000e-01
		; IR: %tmp29 = select i1 %tmp28, i64 1, i64 2
		; IR: %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
		; IR: %tmp7 = zext i32 %tmp30 to i64
		; IR: %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
		; IR: %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
		; IR: %tmp10 = extractelement <4 x i32> %tmp9, i64 0
		; IR: %tmp11 = load volatile i32, i32 addrspace(1)* undef
		; IR: %tmp12 = icmp slt i32 %tmp11, 9
		; IR-NEXT: %27 = xor i1 %tmp12, true
		; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
		; IR-NEXT: br label %Flow3
		; IR: %20 = xor i1 %tmp12, true
		; IR: %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
		; IR: br label %Flow2

		; IR: Flow3:
		; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
		; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
		; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
		; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
		; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
		; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
		; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
		; IR: bb31.loopexit:
		; IR: br label %Flow1

		; IR: bb31:
		; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
		; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
		; IR-NEXT: ret void
		; IR: call void @llvm.amdgcn.end.cf(i64 %7)
		; IR: store volatile i32 0, i32 addrspace(1)* undef
		; IR: ret void


		; GCN-LABEL: {{^}}nested_loop_conditions:

llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll

0 → 100644

+77 −0

Original line number	Diff line number	Diff line
		; XFAIL: *
		; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s

		; FIXME: Merge into backedge-id-bug
		; Variant which has an issue with region construction

		define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
		entry:
		%tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
		%load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
		%tid = call i32 @llvm.amdgcn.workitem.id.x()
		%gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
		%i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
		br label %LOOP.HEADER

		LOOP.HEADER:
		%i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
		call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
		%tmp12 = zext i32 %i to i64
		%tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
		%tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
		%tmp15 = extractelement <4 x i32> %tmp14, i64 0
		%tmp16 = and i32 %tmp15, 65535
		%tmp17 = icmp eq i32 %tmp16, 1
		br i1 %tmp17, label %bb18, label %bb62

		bb18:
		%tmp19 = extractelement <2 x i32> %tmp, i64 0
		%tmp22 = lshr i32 %tmp19, 16
		%tmp24 = urem i32 %tmp22, 52
		%tmp25 = mul nuw nsw i32 %tmp24, 52
		br label %INNER_LOOP

		INNER_LOOP:
		%inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
		call void asm sideeffect "; inner loop body", ""() #0
		%inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
		%inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
		br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP

		INNER_LOOP_BREAK:
		%tmp59 = extractelement <4 x i32> %tmp14, i64 2
		call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
		br label %END_ELSE_BLOCK

		bb62:
		%load13 = icmp ult i32 %tmp16, 271
		;br i1 %load13, label %bb64, label %INCREMENT_I
		; branching directly to the return avoids the bug
		br i1 %load13, label %RETURN, label %INCREMENT_I


		bb64:
		call void asm sideeffect "s_nop 42", "~{memory}"() #0
		br label %RETURN

		INCREMENT_I:
		%inc.i = add i32 %i, 1
		call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
		br label %END_ELSE_BLOCK

		END_ELSE_BLOCK:
		%i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
		call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
		%cmp.end.else.block = icmp eq i32 %i.final, -1
		br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER

		RETURN:
		call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
		store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
		ret void
		}

		declare i32 @llvm.amdgcn.workitem.id.x() #1

		attributes #0 = { convergent nounwind }
		attributes #1 = { convergent nounwind readnone }