Merging r261368: (05d9e8b7) · Commits · llvm-doe / llvm-project

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+31 −106

Original line number	Diff line number	Diff line
		@@ -1409,14 +1409,15 @@ private:
		/// different operations.
		class LoopVectorizationCostModel {
		public:
		LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
		LoopInfo LI, LoopVectorizationLegality Legal,
		LoopVectorizationCostModel(Loop L, ScalarEvolution SE, LoopInfo *LI,
		LoopVectorizationLegality *Legal,
		const TargetTransformInfo &TTI,
		const TargetLibraryInfo TLI, DemandedBits DB,
		AssumptionCache AC, const Function F,
		const LoopVectorizeHints *Hints)
		: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
		AC(AC), TheFunction(F), Hints(Hints) {}
		const LoopVectorizeHints *Hints,
		SmallPtrSetImpl<const Value *> &ValuesToIgnore)
		: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
		TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}

		/// Information about vectorization costs
		struct VectorizationFactor {
		@@ -1464,9 +1465,6 @@ public:
		SmallVector<RegisterUsage, 8>
		calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);

		/// Collect values we want to ignore in the cost model.
		void collectValuesToIgnore();

		private:
		/// Returns the expected execution cost. The unit of the cost does
		/// not matter because we use the 'cost' units to compare different
		@@ -1498,8 +1496,8 @@ public:

		/// The loop that we evaluate.
		Loop *TheLoop;
		/// Predicated scalar evolution analysis.
		PredicatedScalarEvolution &PSE;
		/// Scev analysis.
		ScalarEvolution *SE;
		/// Loop Info analysis.
		LoopInfo *LI;
		/// Vectorization legality.
		@@ -1508,17 +1506,13 @@ public:
		const TargetTransformInfo &TTI;
		/// Target Library Info.
		const TargetLibraryInfo *TLI;
		/// Demanded bits analysis.
		/// Demanded bits analysis
		DemandedBits *DB;
		/// Assumption cache.
		AssumptionCache *AC;
		const Function *TheFunction;
		/// Loop Vectorize Hint.
		// Loop Vectorize Hint.
		const LoopVectorizeHints *Hints;
		/// Values to ignore in the cost model.
		SmallPtrSet<const Value *, 16> ValuesToIgnore;
		/// Values to ignore in the cost model when VF > 1.
		SmallPtrSet<const Value *, 16> VecValuesToIgnore;
		// Values to ignore in the cost model.
		const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
		};

		/// \brief This holds vectorization requirements that must be verified late in
		@@ -1763,10 +1757,19 @@ struct LoopVectorize : public FunctionPass {
		return false;
		}

		// Collect values we want to ignore in the cost model. This includes
		// type-promoting instructions we identified during reduction detection.
		SmallPtrSet<const Value *, 32> ValuesToIgnore;
		CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
		for (auto &Reduction : *LVL.getReductionVars()) {
		RecurrenceDescriptor &RedDes = Reduction.second;
		SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
		ValuesToIgnore.insert(Casts.begin(), Casts.end());
		}

		// Use the cost model.
		LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
		&Hints);
		CM.collectValuesToIgnore();
		LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC,
		F, &Hints, ValuesToIgnore);

		// Check the function attributes to find out if this function should be
		// optimized for size.
		@@ -4744,7 +4747,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
		}

		// Find the trip count.
		unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
		unsigned TC = SE->getSmallConstantTripCount(TheLoop);
		DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

		MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
		@@ -4946,7 +4949,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
		return 1;

		// Do not interleave loops with a relatively small trip count.
		unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
		unsigned TC = SE->getSmallConstantTripCount(TheLoop);
		if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
		return 1;

		@@ -5174,15 +5177,15 @@ LoopVectorizationCostModel::calculateRegisterUsage(
		// Ignore instructions that are never used within the loop.
		if (!Ends.count(I)) continue;

		// Skip ignored values.
		if (ValuesToIgnore.count(I))
		continue;

		// Remove all of the instructions that end at this location.
		InstrList &List = TransposeEnds[i];
		for (unsigned int j = 0, e = List.size(); j < e; ++j)
		OpenIntervals.erase(List[j]);

		// Skip ignored values.
		if (ValuesToIgnore.count(I))
		continue;

		// For each VF find the maximum usage of registers.
		for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
		if (VFs[j] == 1) {
		@@ -5192,12 +5195,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(

		// Count the number of live intervals.
		unsigned RegUsage = 0;
		for (auto Inst : OpenIntervals) {
		// Skip ignored values for VF > 1.
		if (VecValuesToIgnore.count(Inst))
		continue;
		for (auto Inst : OpenIntervals)
		RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
		}
		MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
		}

		@@ -5341,7 +5340,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
		if (VF > 1 && MinBWs.count(I))
		RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
		Type *VectorTy = ToVectorTy(RetTy, VF);
		auto SE = PSE.getSE();

		// TODO: We need to estimate the cost of intrinsic calls.
		switch (I->getOpcode()) {
		@@ -5643,79 +5641,6 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
		return false;
		}

		void LoopVectorizationCostModel::collectValuesToIgnore() {
		// Ignore ephemeral values.
		CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

		// Ignore type-promoting instructions we identified during reduction
		// detection.
		for (auto &Reduction : *Legal->getReductionVars()) {
		RecurrenceDescriptor &RedDes = Reduction.second;
		SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
		VecValuesToIgnore.insert(Casts.begin(), Casts.end());
		}

		// Ignore induction phis that are only used in either GetElementPtr or ICmp
		// instruction to exit loop. Induction variables usually have large types and
		// can have big impact when estimating register usage.
		// This is for when VF > 1.
		for (auto &Induction : *Legal->getInductionVars()) {
		auto *PN = Induction.first;
		auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());

		// Check that the PHI is only used by the induction increment (UpdateV) or
		// by GEPs. Then check that UpdateV is only used by a compare instruction or
		// the loop header PHI.
		// FIXME: Need precise def-use analysis to determine if this instruction
		// variable will be vectorized.
		if (std::all_of(PN->user_begin(), PN->user_end(),
		[&](const User *U) -> bool {
		return U == UpdateV \|\| isa<GetElementPtrInst>(U);
		}) &&
		std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
		[&](const User *U) -> bool {
		return U == PN \|\| isa<ICmpInst>(U);
		})) {
		VecValuesToIgnore.insert(PN);
		VecValuesToIgnore.insert(UpdateV);
		}
		}

		// Ignore instructions that will not be vectorized.
		// This is for when VF > 1.
		for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
		++bb) {
		for (auto &Inst : **bb) {
		switch (Inst.getOpcode()) {
		case Instruction::GetElementPtr: {
		// Ignore GEP if its last operand is an induction variable so that it is
		// a consecutive load/store and won't be vectorized as scatter/gather
		// pattern.

		GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
		unsigned NumOperands = Gep->getNumOperands();
		unsigned InductionOperand = getGEPInductionOperand(Gep);
		bool GepToIgnore = true;

		// Check that all of the gep indices are uniform except for the
		// induction operand.
		for (unsigned i = 0; i != NumOperands; ++i) {
		if (i != InductionOperand &&
		!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
		TheLoop)) {
		GepToIgnore = false;
		break;
		}
		}

		if (GepToIgnore)
		VecValuesToIgnore.insert(&Inst);
		break;
		}
		}
		}
		}
		}

		void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
		bool IfPredicateStore) {

llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll

deleted100644 → 0

+0 −71

Original line number	Diff line number	Diff line
		; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -S 2>&1 \| FileCheck %s
		; REQUIRES: asserts

		target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
		target triple = "x86_64-unknown-linux-gnu"

		@a = global [1024 x i8] zeroinitializer, align 16
		@b = global [1024 x i8] zeroinitializer, align 16

		define i32 @foo() {
		; This function has a loop of SAD pattern. Here we check when VF = 16 the
		; register usage doesn't exceed 16.
		;
		; CHECK-LABEL: foo
		; CHECK: LV(REG): VF = 4
		; CHECK-NEXT: LV(REG): Found max usage: 4
		; CHECK: LV(REG): VF = 8
		; CHECK-NEXT: LV(REG): Found max usage: 7
		; CHECK: LV(REG): VF = 16
		; CHECK-NEXT: LV(REG): Found max usage: 13

		entry:
		br label %for.body

		for.cond.cleanup:
		%add.lcssa = phi i32 [ %add, %for.body ]
		ret i32 %add.lcssa

		for.body:
		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
		%s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
		%arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
		%0 = load i8, i8* %arrayidx, align 1
		%conv = zext i8 %0 to i32
		%arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
		%1 = load i8, i8* %arrayidx2, align 1
		%conv3 = zext i8 %1 to i32
		%sub = sub nsw i32 %conv, %conv3
		%ispos = icmp sgt i32 %sub, -1
		%neg = sub nsw i32 0, %sub
		%2 = select i1 %ispos, i32 %sub, i32 %neg
		%add = add nsw i32 %2, %s.015
		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
		%exitcond = icmp eq i64 %indvars.iv.next, 1024
		br i1 %exitcond, label %for.cond.cleanup, label %for.body
		}

		define i64 @bar(i64* nocapture %a) {
		; CHECK-LABEL: bar
		; CHECK: LV(REG): VF = 2
		; CHECK: LV(REG): Found max usage: 4
		;
		entry:
		br label %for.body

		for.cond.cleanup:
		%add2.lcssa = phi i64 [ %add2, %for.body ]
		ret i64 %add2.lcssa

		for.body:
		%i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
		%s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
		%arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
		%0 = load i64, i64* %arrayidx, align 8
		%add = add nsw i64 %0, %i.012
		store i64 %add, i64* %arrayidx, align 8
		%add2 = add nsw i64 %add, %s.011
		%inc = add nuw nsw i64 %i.012, 1
		%exitcond = icmp eq i64 %inc, 1024
		br i1 %exitcond, label %for.cond.cleanup, label %for.body
		}

llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll

+1 −1

Original line number	Diff line number	Diff line
		@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
		; -vectorizer-maximize-bandwidth is indicated.
		;
		; CHECK-label: foo
		; CHECK: LV: Selecting VF: 32.
		; CHECK: LV: Selecting VF: 16.
		define void @foo() {
		entry:
		br label %for.body