Commit c1b347ea authored by Alexey Bataev, committed by GitHub

[SLP] Refine loop-aware gather cost and admit sibling-loop subtrees

Factor the loop-nest walk out of getScaleToLoopIterations() into a pure
getLoopNestScale(). It is reused by a new getGatherNodeEffectiveScale()
(guarded by -slp-per-lane-gather-scale, on by default) that averages
per-lane scales, so LICM-hoistable gather operands no longer pay the
inner loop's trip count. Also relax the tree-builder's loop-nest guard
to admit sibling inner loops that share a common outer loop, but only
when SCEV proves their backedge-taken counts equal.

Reviewers: hiraditya, RKSimon, bababuck

Pull Request: https://github.com/llvm/llvm-project/pull/192801
parent ba2f42d5
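To see the shape of the per-lane refinement, here is a minimal standalone
sketch of the averaging arithmetic; the names and inputs are illustrative,
not the patch's API (the real getGatherNodeEffectiveScale() derives each
lane's scale from LoopInfo/SCEV and uses LLVM's saturating helpers rather
than plain arithmetic):

#include <algorithm>
#include <cstdint>
#include <vector>

// Each lane pays the scale of the loop its operand lives in; the entry's
// effective scale is the ceil-divided average, clamped so the refinement
// can never exceed the whole-entry (base) scale the old model charged.
uint64_t effectiveGatherScale(const std::vector<uint64_t> &LaneScales,
                              uint64_t BaseScale) {
  if (LaneScales.empty())
    return BaseScale; // No per-lane info: fall back to the whole-entry scale.
  uint64_t Sum = 0;
  for (uint64_t S : LaneScales)
    Sum += std::min(S, BaseScale); // A refinement may only lower the cost.
  uint64_t N = LaneScales.size();
  uint64_t Avg = (Sum + N - 1) / N; // Ceil-divide so the scale stays >= 1.
  return std::clamp<uint64_t>(Avg, 1, BaseScale);
}

For a two-lane gather where one operand is loop-variant (scale 100) and the
other is hoistable (scale 1), this yields 51 instead of charging both lanes
the inner loop's 100.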
+181 −47
@@ -259,6 +259,18 @@ static cl::opt<unsigned> LoopAwareTripCount(
     cl::desc("Loop trip count, considered by the cost model during "
              "modeling (0=loops are ignored and considered flat code)"));
 
+/// Refine the loop-aware cost scaling of gather/buildvector tree entries by
+/// using the per-lane execution scale of the operand that feeds each lane,
+/// instead of a single whole-entry scale. This matches the LICM hoisting
+/// performed by optimizeGatherSequence() at codegen time: lanes whose
+/// operands are loop-invariant in an inner loop contribute the outer loop's
+/// execution scale rather than the inner loop's, which avoids over-costing
+/// buildvectors that bridge values from outer loop nests into an inner loop.
+static cl::opt<bool> PerLaneGatherScale(
+    "slp-per-lane-gather-scale", cl::init(true), cl::Hidden,
+    cl::desc("Use per-lane execution scale for gather/buildvector tree "
+             "entries to model LICM-hoistable buildvector sequences."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -3942,10 +3954,27 @@ private:
   /// external use.
   /// \p U is the user of the vectorized value from the entry, if using the
   /// parent for the external use.
-  unsigned getScaleToLoopIterations(const TreeEntry &TE,
+  uint64_t getScaleToLoopIterations(const TreeEntry &TE,
                                     Value *Scalar = nullptr,
                                     Instruction *U = nullptr);
 
+  /// \returns the product of trip counts of the loop \p L and all of its
+  /// enclosing loops. Unlike the state kept by getScaleToLoopIterations(),
+  /// this helper depends only on the loop structure and is independent of
+  /// per-entry operand invariance. Returns 1 when loop-aware cost modeling
+  /// is disabled or \p L is null.
+  uint64_t getLoopNestScale(const Loop *L);
+
+  /// \returns a refined execution scale for a gather/buildvector tree entry
+  /// \p TE. The scale is computed as the average of per-lane execution
+  /// scales: each lane's scale is the loop-nest scale of the loop that
+  /// contains the lane's defining instruction (or 1 if the lane is a
+  /// constant / loop-invariant non-instruction value). This models the
+  /// LICM hoisting that optimizeGatherSequence() performs after vectorization
+  /// for inserts with loop-invariant operands. Falls back to the whole-entry
+  /// scale when per-lane information is unavailable or the feature is off.
+  uint64_t getGatherNodeEffectiveScale(const TreeEntry &TE);
+
   /// Get the loop nest for the given loop \p L.
   ArrayRef<const Loop *> getLoopNest(const Loop *L);
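As a rough model of the getLoopNestScale() contract documented above, the
sketch below computes the product of trip counts along a parent chain with
memoization. The ToyLoop struct and its TripCount field are stand-ins for
LLVM's LoopInfo/SCEV queries, and the recursion replaces the patch's
explicit collect-and-reverse chain walk (which additionally saturates
instead of wrapping on overflow):

#include <cstdint>
#include <unordered_map>

// Toy stand-in for llvm::Loop: a parent pointer plus a known trip count.
struct ToyLoop {
  const ToyLoop *Parent = nullptr;
  uint64_t TripCount = 1;
};

// Memoized nest scale: trip count of L times the scale of its parent.
// Matches the documented contract of returning 1 for a null loop.
uint64_t loopNestScale(const ToyLoop *L,
                       std::unordered_map<const ToyLoop *, uint64_t> &Cache) {
  if (!L)
    return 1;
  if (auto It = Cache.find(L); It != Cache.end())
    return It->second;
  uint64_t Scale = L->TripCount * loopNestScale(L->Parent, Cache);
  Cache[L] = Scale;
  return Scale;
}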
@@ -4904,9 +4933,10 @@ private:
   /// Maps the loops to their loop nests.
   SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
-  /// Maps the loops to their scale factor, which is built as a multiplication
-  /// of the tripcounts of the loops in the loop nest.
-  SmallDenseMap<const Loop *, unsigned> LoopToScaleFactor;
+  /// Per-loop cache of nest scale factors: the product of trip counts of the
+  /// loop and all of its ancestors. Shared by getLoopNestScale() and (via it)
+  /// by getScaleToLoopIterations() and getGatherNodeEffectiveScale().
+  SmallDenseMap<const Loop *, uint64_t> LoopNestScaleCache;
 
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
@@ -12645,22 +12675,56 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                  S.getMainOp()->getParent()) {
     BasicBlock *Parent = S.getMainOp()->getParent();
     if (const Loop *L = LI->getLoopFor(Parent)) {
-      // Check that the new loop nest is not involved.
-      // Otherwise, mark it as a gather node.
+      // Check that the new loop nest shares the same outer structure as the
+      // tree's current loop nest. Completely disjoint nests (different
+      // outermost loops) are forced to gather because their scales cannot be
+      // meaningfully combined. Sibling inner loops (inside a common outer
+      // loop or outside any loops at all) are allowed: the cost model scales
+      // each entry by its own loop via getScaleToLoopIterations(), so a tree
+      // that spans sibling inner loops (e.g. a PHI at their merge block) can
+      // still be costed correctly. Contract CurrentLoopNest to the longest
+      // common prefix with the new entry's nest so subsequent entries in yet
+      // another sibling can also be admitted.
       L = findInnermostNonInvariantLoop(L, VL);
       if (L) {
         SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
+        unsigned CommonLen = 0;
         for (const auto [L1, L2] : zip(CurrentLoopNest, NewLoopNest)) {
-          if (L1 != L2) {
-            LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n");
-            newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
-            return;
-          }
+          if (L1 != L2)
+            break;
+          ++CommonLen;
         }
-        if (NewLoopNest.size() > CurrentLoopNest.size())
-          CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
-                                 NewLoopNest.end());
+        if (CurrentLoopNest.empty()) {
+          CurrentLoopNest.assign(NewLoopNest);
+        } else if (CommonLen < CurrentLoopNest.size() &&
+                   CommonLen < NewLoopNest.size()) {
+          // Divergence below the common prefix: the tree now spans sibling
+          // loops at depth CommonLen. Admitting them into one tree makes
+          // the profitability decision JOINT across both siblings, so a
+          // very hot sibling could otherwise let an unprofitable cold
+          // sibling ride along "for free" (per-entry scaling of the cold
+          // sibling's entries would be dwarfed by the hot one). Require
+          // SCEV-proven equal backedge-taken counts for the diverging
+          // siblings before joining; otherwise force gather.
+          const Loop *SibA = CurrentLoopNest[CommonLen];
+          const Loop *SibB = NewLoopNest[CommonLen];
+          const SCEV *BecA = SE->getBackedgeTakenCount(SibA);
+          const SCEV *BecB = SE->getBackedgeTakenCount(SibB);
+          if (isa<SCEVCouldNotCompute>(BecA) || BecA != BecB) {
+            LLVM_DEBUG(dbgs()
+                       << "SLP: Sibling loops have different trip counts.\n");
+            newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+            return;
+          }
+          CurrentLoopNest.truncate(CommonLen);
+        } else if (NewLoopNest.size() > CurrentLoopNest.size()) {
+          // New entry lives deeper in the same nest chain; extend.
+          CurrentLoopNest.append(
+              std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
+              NewLoopNest.end());
+        }
+        // Otherwise NewLoopNest is a prefix of CurrentLoopNest: keep as-is.
       }
     }
   }
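The three cases the comment distinguishes (adopt, sibling divergence,
extend) are easier to see in isolation. Below is a self-contained sketch of
the same prefix/divergence logic; LoopID and the EqualTripCounts callback
are stand-ins for const Loop * and the SCEV backedge-taken-count query:

#include <cstddef>
#include <functional>
#include <vector>

using LoopID = int;

// Returns true if the new entry's nest may join the tree's nest, updating
// Current in place; returns false when diverging siblings fail the
// trip-count predicate (the real code then forces a gather node).
bool joinLoopNests(std::vector<LoopID> &Current,
                   const std::vector<LoopID> &New,
                   const std::function<bool(LoopID, LoopID)> &EqualTripCounts) {
  size_t CommonLen = 0;
  while (CommonLen < Current.size() && CommonLen < New.size() &&
         Current[CommonLen] == New[CommonLen])
    ++CommonLen;
  if (Current.empty()) {
    Current = New; // First loop-resident entry: adopt its nest.
  } else if (CommonLen < Current.size() && CommonLen < New.size()) {
    // Siblings diverge below the common prefix: join only when their trip
    // counts are provably equal, then keep just the shared prefix.
    if (!EqualTripCounts(Current[CommonLen], New[CommonLen]))
      return false;
    Current.resize(CommonLen);
  } else if (New.size() > Current.size()) {
    // Same chain, deeper: extend to the new, deeper nest.
    Current.assign(New.begin(), New.end());
  }
  // Otherwise New is a prefix of Current: keep as-is.
  return true;
}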
@@ -16069,7 +16133,7 @@ static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE) {
   return LoopAwareTripCount;
 }
 
-unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
+uint64_t BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
                                            Instruction *U) {
   BasicBlock *Parent = nullptr;
   if (U) {
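The unsigned-to-uint64_t widening threaded through this patch matters
because the scale is a product of trip counts, and a 32-bit accumulator
wraps quickly. A small demonstration (trip counts illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  // Three nested loops with trip count 2000 each: a nest scale of 8e9.
  uint32_t Narrow = 1; // old 'unsigned' accumulator
  uint64_t Wide = 1;   // new 'uint64_t' accumulator
  for (int I = 0; I < 3; ++I) {
    Narrow *= 2000u; // wraps modulo 2^32 on the last multiply
    Wide *= 2000u;
  }
  // Prints 3705032704 vs 8000000000: the wrapped 32-bit scale would make
  // the deeper, hotter nest look cheaper than it really is.
  std::printf("unsigned: %u\nuint64_t: %llu\n", Narrow,
              (unsigned long long)Wide);
}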
@@ -16096,31 +16160,90 @@ unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
   } else {
     Parent = TE.getMainOp()->getParent();
   }
-  if (const Loop *L = LI->getLoopFor(Parent)) {
-    const auto It = LoopToScaleFactor.find(L);
-    if (It != LoopToScaleFactor.end())
-      return It->second;
-    unsigned Scale = 1;
-    if (const Loop *NonInvL = findInnermostNonInvariantLoop(
-            L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars))) {
-      Scale = getLoopTripCount(NonInvL, *SE);
-      for (const Loop *LN : getLoopNest(NonInvL)) {
-        if (LN == L)
-          break;
-        auto LNRes = LoopToScaleFactor.try_emplace(LN, 0);
-        auto &LoopScale = LNRes.first->getSecond();
-        if (!LNRes.second) {
-          Scale *= LoopScale;
-          break;
-        }
-        Scale *= getLoopTripCount(LN, *SE);
-        LoopScale = Scale;
-      }
-    }
-    LoopToScaleFactor.try_emplace(L, Scale);
-    return Scale;
-  }
-  return 1;
+  const Loop *L = LI->getLoopFor(Parent);
+  if (!L)
+    return 1;
+  // The entry's cost is paid once per execution of the innermost loop in
+  // which some of its operands are variant. Operands that are invariant in
+  // all enclosing loops are executed once (LICM will hoist them out).
+  return getLoopNestScale(findInnermostNonInvariantLoop(
+      L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars)));
+}
+
+uint64_t BoUpSLP::getLoopNestScale(const Loop *L) {
+  if (!L || LoopAwareTripCount == 0)
+    return 1;
+  if (auto It = LoopNestScaleCache.find(L); It != LoopNestScaleCache.end())
+    return It->second;
+  // Collect loops from L outward up to (but not including) the first cached
+  // ancestor or the function top, then walk back inward multiplying trip
+  // counts. Use uint64_t to avoid silent overflow on deep/large nests.
+  SmallVector<const Loop *> Chain;
+  for (const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
+    if (LoopNestScaleCache.contains(Cur))
+      break;
+    Chain.push_back(Cur);
+  }
+  assert(!Chain.empty() && "Early-return above should have handled cache hit.");
+  uint64_t Scale = 1;
+  if (const Loop *Parent = Chain.back()->getParentLoop())
+    Scale = LoopNestScaleCache.lookup(Parent);
+  // Walk from the outermost uncached loop inward, accumulating trip counts.
+  // Use SaturatingMultiply to clamp at uint64_t max on deep/large nests
+  // rather than wrapping around.
+  for (const Loop *Cur : reverse(Chain)) {
+    uint64_t TC = std::max<uint64_t>(1, getLoopTripCount(Cur, *SE));
+    Scale = SaturatingMultiply(Scale, TC);
+    LoopNestScaleCache.try_emplace(Cur, std::max<uint64_t>(1, Scale));
+  }
+  return std::max<uint64_t>(1, Scale);
+}
+
+uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) {
+  // Only meaningful for gather/buildvector-like entries; the per-lane
+  // insertelements that make up such an entry are LICM-hoistable by
+  // optimizeGatherSequence() when their operand is loop-invariant.
+  assert((TE.isGather() || TE.State == TreeEntry::SplitVectorize) &&
+         "Expected gather/split tree entry.");
+  uint64_t BaseScale = getScaleToLoopIterations(TE);
+  if (!PerLaneGatherScale || LoopAwareTripCount == 0 || BaseScale <= 1)
+    return BaseScale;
+  // Average the per-lane execution scales: for each lane, reuse the same
+  // scale helper the rest of the cost model uses, but ask it about that
+  // one lane's value. Lanes that are loop-invariant in the current nest
+  // collapse to their outer-loop scale (or 1 for fully invariant/constant
+  // lanes), which matches the LICM hoisting performed by
+  // optimizeGatherSequence(). Cap per-lane contributions by BaseScale so a
+  // refinement can never raise the cost above the whole-entry scale.
+  // Each lane contributes at most BaseScale, so Sum is bounded above by
+  // N * BaseScale. If BaseScale is near uint64_t max (saturated by
+  // getLoopNestScale on a deep nest) Sum can still overflow uint64_t,
+  // which would silently wrap and produce a wrong average. Use
+  // SaturatingAdd and bail out to BaseScale on overflow: the true average
+  // is bounded above by BaseScale anyway, so this preserves the
+  // refinement's invariant that it can never raise cost.
+  uint64_t Sum = 0;
+  unsigned N = 0;
+  bool Overflow = false;
+  for (Value *V : TE.Scalars) {
+    if (isConstant(V))
+      continue;
+    ++N;
+    uint64_t LaneScale = std::min(getScaleToLoopIterations(TE, V), BaseScale);
+    Sum = SaturatingAdd(Sum, LaneScale, &Overflow);
+    if (Overflow)
+      return BaseScale;
+  }
+  if (N == 0)
+    return BaseScale;
+  // Ceil-divide so we never round the effective scale down below 1.
+  uint64_t Numerator = SaturatingAdd(Sum, uint64_t(N - 1), &Overflow);
+  if (Overflow)
+    return BaseScale;
+  uint64_t Avg = Numerator / N;
+  return std::clamp<uint64_t>(Avg, 1, BaseScale);
 }
 
 InstructionCost
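The overflow handling above relies on LLVM's SaturatingMultiply and
SaturatingAdd from llvm/Support/MathExtras.h. As a standalone illustration
of the clamping behavior (not the LLVM implementation), an equivalent can
be written with the GCC/Clang overflow builtins:

#include <cstdint>
#include <limits>

// Saturating multiply: clamp to UINT64_MAX instead of wrapping, so deep or
// hot loop nests yield a huge-but-ordered scale rather than a small wrapped
// one that would invert cost comparisons.
static uint64_t satMul(uint64_t A, uint64_t B) {
  uint64_t R;
  if (__builtin_mul_overflow(A, B, &R))
    return std::numeric_limits<uint64_t>::max();
  return R;
}

// Saturating add reporting overflow, mirroring how the per-lane sum above
// bails out to the whole-entry scale when it would wrap.
static uint64_t satAdd(uint64_t A, uint64_t B, bool &Overflow) {
  uint64_t R;
  Overflow = __builtin_add_overflow(A, B, &R);
  return Overflow ? std::numeric_limits<uint64_t>::max() : R;
}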
@@ -18025,7 +18148,7 @@ InstructionCost BoUpSLP::getSpillCost() {
     if (It != MinBWs.end())
       ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
     auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
-    unsigned Scale = getScaleToLoopIterations(*Op);
+    uint64_t Scale = getScaleToLoopIterations(*Op);
     InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
     KeepLiveCost *= Scale;
     Cost += KeepLiveCost;
@@ -18462,8 +18585,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
   };
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost = 0;
-  SmallDenseMap<const TreeEntry *, unsigned> EntryToScale;
-  unsigned PrevScale = 0;
+  SmallDenseMap<const TreeEntry *, uint64_t> EntryToScale;
+  uint64_t PrevScale = 0;
   BasicBlock *PrevVecParent = nullptr;
   for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
     TreeEntry &TE = *Ptr;
@@ -18498,8 +18621,14 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
            "Expected gather nodes with users only.");
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
-    unsigned Scale = 0;
+    uint64_t Scale = 0;
     bool CostIsFree = C == 0;
+    // For gather/buildvector (and split-vectorize) entries, prefer the
+    // per-lane refined scale that accounts for LICM-hoistable insertelements
+    // when an operand is invariant in the current loop nest but defined in
+    // an outer loop. This prevents over-costing cross-loop-nest buildvectors.
+    const bool IsGatherLike =
+        TE.isGather() || TE.State == TreeEntry::SplitVectorize;
     if (!CostIsFree && !TE.isGather() && TE.hasState()) {
       if (PrevVecParent == TE.getMainOp()->getParent()) {
         Scale = PrevScale;
@@ -18508,7 +18637,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
       }
     }
     if (!CostIsFree && !Scale) {
-      Scale = getScaleToLoopIterations(TE);
+      Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE)
+                           : getScaleToLoopIterations(TE);
       C *= Scale;
       EntryToScale.try_emplace(&TE, Scale);
       if (!TE.isGather() && TE.hasState()) {
@@ -18872,9 +19002,13 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
         NodesCosts.try_emplace(TE.get(), C);
         continue;
       }
-      unsigned Scale = EntryToScale.lookup(TE.get());
-      if (!Scale)
-        Scale = getScaleToLoopIterations(*TE.get());
+      uint64_t Scale = EntryToScale.lookup(TE.get());
+      if (!Scale) {
+        const bool IsGatherLike =
+            TE->isGather() || TE->State == TreeEntry::SplitVectorize;
+        Scale = IsGatherLike ? getGatherNodeEffectiveScale(*TE.get())
+                             : getScaleToLoopIterations(*TE.get());
+      }
       C *= Scale;
       NodesCosts.try_emplace(TE.get(), C);
     }
@@ -18952,13 +19086,13 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
   }
   InstructionCost Cost = TreeCost;
-  SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, unsigned>
+  SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, uint64_t>
       EntryToScale;
   auto ScaleCost = [&](InstructionCost C, const TreeEntry &TE,
                        Value *Scalar = nullptr, Instruction *U = nullptr) {
     if (!C.isValid() || C == 0)
       return C;
-    unsigned &Scale =
+    uint64_t &Scale =
         EntryToScale.try_emplace(std::make_tuple(&TE, Scalar, U), 0)
             .first->getSecond();
     if (!Scale)
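The ScaleCost lambda memoizes one scale per (entry, scalar, user) key, with
0 meaning "not computed yet". A generic standalone rendering of that caching
pattern, with std::map standing in for SmallDenseMap so the sketch needs no
LLVM headers:

#include <cstdint>
#include <functional>
#include <map>
#include <tuple>

// Memoize an expensive scale computation keyed by a composite key. A zero
// cached value means "not computed yet", matching the try_emplace(..., 0)
// idiom in the patch (valid because real scales are always >= 1).
template <typename... Keys>
uint64_t cachedScale(std::map<std::tuple<Keys...>, uint64_t> &Cache,
                     const std::tuple<Keys...> &Key,
                     const std::function<uint64_t()> &Compute) {
  uint64_t &Scale = Cache.try_emplace(Key, 0).first->second;
  if (!Scale)
    Scale = Compute(); // First query for this key: compute and remember.
  return Scale;
}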
+122 −151

Preview size limit exceeded, changes collapsed.

+7 −4
@@ -36,7 +36,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '12'
+; YAML-NEXT:   - Cost:            '10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'

@@ -47,6 +47,8 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -66,10 +68,11 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[T4]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T6]], i32 0
@@ -128,7 +131,7 @@ for.body:
 ; YAML:      Function:        getelementptr_2x32
 ; YAML:     Args:
 ; YAML:        - String:          'SLP vectorized with cost '
-; YAML:        - Cost:            '12'
+; YAML:        - Cost:            '10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'

+9 −10
@@ -207,22 +207,21 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
-; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[A1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[L_3:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L_4:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> <float 3.000000e+00, float 3.000000e+00, float poison, float 3.000000e+00>, float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[L_3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[L_4]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
-; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast float [[X:%.*]], [[L_3]]
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_4]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
-; CHECK-NEXT:    [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[RED_NEXT]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP7]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 10
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+1 −1
@@ -73,7 +73,7 @@ define void @fun1(double %0) {
 ; REMARK-LABEL: Function: fun1
 ; REMARK: Args:
 ; REMARK:      - String:          'SLP vectorized with cost '
-; REMARK-NEXT: - Cost:            '-2'
+; REMARK-NEXT: - Cost:            '-3'
 
   br label %2