[LV] Scalar with predication must not be uniform (6ed9cef2) · Commits · llvm-doe / llvm-project

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+22 −17

Original line number	Diff line number	Diff line
		@@ -4668,14 +4668,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
		SetVector<Instruction *> Worklist;
		BasicBlock *Latch = TheLoop->getLoopLatch();

		// Gatekeeper for every insertion into the uniform Worklist. An instruction
		// that is scalar with predication must never be recorded as uniform after
		// vectorization: doing so would create an erroneous replicating region
		// where only a single instance out of VF should be formed. Such
		// instructions are reported and left out of the Worklist.
		// TODO: optimize these rare cases if found important, see PR40816.
		auto addToWorklistIfAllowed = [&](Instruction *Inst) -> void {
		  if (!isScalarWithPredication(Inst, VF)) {
		    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Inst << "\n");
		    Worklist.insert(Inst);
		    return;
		  }
		  LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
		                    << *Inst << "\n");
		};

		// Start with the conditional branch. If the branch condition is an
		// instruction contained in the loop that is only used by the branch, it is
		// uniform.
		auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
		if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
		Worklist.insert(Cmp);
		LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
		}
		if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
		addToWorklistIfAllowed(Cmp);

		// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
		// are pointers that are treated like consecutive pointers during
		@@ -4734,10 +4746,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
		// Add to the Worklist all consecutive and consecutive-like pointers that
		// aren't also identified as possibly non-uniform.
		for (auto *V : ConsecutiveLikePtrs)
		if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
		LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
		Worklist.insert(V);
		}
		if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
		addToWorklistIfAllowed(V);

		// Expand Worklist in topological order: whenever a new instruction
		// is added, its users should already be inside Worklist. It ensures
		@@ -4763,10 +4773,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
		return Worklist.count(J) \|\|
		(OI == getLoadStorePointerOperand(J) &&
		isUniformDecision(J, VF));
		})) {
		Worklist.insert(OI);
		LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
		}
		}))
		addToWorklistIfAllowed(OI);
		}
		}

		@@ -4808,11 +4816,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
		continue;

		// The induction variable and its update instruction will remain uniform.
		Worklist.insert(Ind);
		Worklist.insert(IndUpdate);
		LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
		LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
		<< "\n");
		addToWorklistIfAllowed(Ind);
		addToWorklistIfAllowed(IndUpdate);
		}

		Uniforms[VF].insert(Worklist.begin(), Worklist.end());

llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll

+83 −0

Original line number	Diff line number	Diff line
		; REQUIRES: asserts
		; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
		; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE

		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
		target triple = "x86_64-unknown-linux-gnu"
		@@ -65,3 +66,85 @@ for.end:
		}

		attributes #0 = { "target-cpu"="knl" }

		; CHECK-LABEL: PR40816
		;
		; Check that scalar with predication instructions are not considered uniform
		; after vectorization, because that results in replicating a region instead of
		; having a single instance (out of VF). The predication stems from a tiny count
		; of 3 leading to folding the tail by masking using icmp ule <i, i+1> <= <2, 2>.
		;
		; CHECK: LV: Found trip count: 3
		; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0
		; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, i32* {{%.*}}, align 1
		; CHECK: LV: Found not uniform being ScalarWithPredication: {{%.*}} = load i32, i32* {{%.*}}, align 1
		; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}}
		;
		; FORCE-LABEL: @PR40816(
		; FORCE-NEXT: entry:
		; FORCE-NEXT: br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
		; FORCE: vector.ph:
		; FORCE-NEXT: br label [[VECTOR_BODY:%.*]]
		; FORCE: vector.body:
		; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
		; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
		; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
		; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
		; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
		; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
		; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
		; FORCE: pred.store.if:
		; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1
		; FORCE-NEXT: br label [[PRED_STORE_CONTINUE]]
		; FORCE: pred.store.continue:
		; FORCE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
		; FORCE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
		; FORCE: pred.store.if1:
		; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1
		; FORCE-NEXT: br label [[PRED_STORE_CONTINUE2]]
		; FORCE: pred.store.continue2:
		; FORCE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
		; FORCE-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
		; FORCE: pred.load.if:
		; FORCE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
		; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
		; FORCE-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0
		; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]]
		; FORCE: pred.load.continue:
		; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
		; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
		; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
		; FORCE: pred.load.if3:
		; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
		; FORCE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
		; FORCE-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
		; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]]
		; FORCE: pred.load.continue4:
		; FORCE-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
		; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
		; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
		; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
		; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
		;
		@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
		@b = external global i32, align 1

		; Reproducer for PR40816: a single-block loop whose induction value %0 starts
		; at 0 and is incremented by 1 per iteration. Each iteration stores %0 to @b,
		; loads element %0 of @a, and leaves the loop once the loaded value is 0.
		define void @PR40816() #1 {

		entry:
		br label %for.body

		for.body: ; preds = %for.body, %entry
		%0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
		store i32 %0, i32* @b, align 1
		%arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0
		%1 = load i32, i32* %arrayidx1, align 1
		; The exit test compares the loaded value, not the induction variable.
		%cmp2 = icmp eq i32 %1, 0
		%inc = add nuw nsw i32 %0, 1
		br i1 %cmp2, label %return, label %for.body

		return: ; preds = %for.body
		ret void
		}

		attributes #1 = { "target-cpu"="core2" }