Unverified Commit 783bf90e authored by Alexey Bataev's avatar Alexey Bataev Committed by GitHub
Browse files

[SLP]Vectorize operand chains of non-vectorizable instructions

Extend the post-process operand-chain seeding (previously only cmps)
to non-vectorizable calls, invokes, callbrs, non-trivially-vectorizable
intrinsics, atomicrmw, cmpxchg, returns, and stores. Stores are
processed after every other vectorization attempt in the basic block.

Reviewers: bababuck, hiraditya, RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/194248
parent 23483d49
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -143,6 +143,12 @@ private:
  bool vectorizeCmpInsts(iterator_range<ItT> CmpInsts, BasicBlock *BB,
                         slpvectorizer::BoUpSLP &R);

  /// Tries to vectorize the operand chains of the non-vectorizable
  /// instructions in \p Insts.
  template <typename ItT>
  bool vectorizeNonVectorizableInsts(iterator_range<ItT> Insts, BasicBlock *BB,
                                     slpvectorizer::BoUpSLP &R);

  /// Tries to vectorize constructs started from InsertValueInst or
  /// InsertElementInst instructions.
  bool vectorizeInserts(InstSetVector &Instructions, BasicBlock *BB,
+315 −15
Original line number Diff line number Diff line
@@ -232,6 +232,15 @@ static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
// Debug/testing knob: skip the pessimistic cost pre-check and always attempt
// operand-chain vectorization for the collected non-vectorizable stores.
static cl::opt<bool> ForcePostProcessStoresOperands(
    "slp-postprocess-stores-operands", cl::init(false), cl::Hidden,
    cl::desc("Force vectorization of non-vectorizable stores operands."));

// When set, non-vectorizable instructions (calls, atomics, returns, stores)
// are also tried as horizontal-reduction roots, not only as bundle seeds.
static cl::opt<bool> NonVectReductions(
    "slp-non-vectorizables-as-reductions", cl::init(false), cl::Hidden,
    cl::desc(
        "Use non-vectorizable instructions as potential reduction roots."));
/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
@@ -7005,11 +7014,12 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
/// instead of a scalar.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
static InstructionCost getScalarizationOverhead(
    const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty,
    const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
    ArrayRef<Value *> VL = {},
    TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
@@ -7034,7 +7044,7 @@ getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
    return Cost;
  }
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind, ForPoisonSrc, VL);
                                      CostKind, ForPoisonSrc, VL, VIC);
}
/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
@@ -30263,6 +30273,245 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
  return Changed;
}
/// Returns true if \p I is an instruction whose result the SLP vectorizer
/// cannot turn into a vector instruction directly, but whose operand chains
/// may still be worth vectorizing as bundle seeds.
static bool isNonVectorizableInst(const Instruction *I,
                                  const TargetLibraryInfo *TLI) {
  if (const auto *CB = dyn_cast<CallBase>(I)) {
    if (CB->isInlineAsm())
      return false;
    if (const auto *II = dyn_cast<IntrinsicInst>(CB)) {
      if (II->isAssumeLikeIntrinsic())
        return false;
      if (isa<AnyMemIntrinsic>(II))
        return false;
    }
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (isTriviallyVectorizable(ID))
        return false;
      if (!VFDatabase::getMappings(*CI).empty())
        return false;
      if (all_of(CI->args(), [](const Value *Arg) {
            return !isa<Instruction>(Arg) || Arg->getType()->isPointerTy();
          }))
        return false;
      if (any_of(CI->args(), [](const Value *Arg) {
            return Arg->getType()->isPointerTy();
          }))
        return false;
    }
    // Skip vector-returning calls in non-revec mode - we cannot turn their
    // results into wider vectors here.
    return SLPReVec || !CB->getType()->isVectorTy();
  }
  if (isa<AtomicRMWInst, AtomicCmpXchgInst>(I))
    return true;
  if (const auto *RI = dyn_cast<ReturnInst>(I))
    return RI->getNumOperands() > 0 &&
           (SLPReVec || !I->getOperand(0)->getType()->isVectorTy()) &&
           isa<Instruction>(I->getOperand(0));
  return false;
}
/// Visits the value operands of \p I that are candidates for operand-chain
/// vectorization, invoking \p F with each operand and its slot index.
template <typename Func>
static void forEachOperandChainCandidate(Instruction *I, Func F,
                                         bool ForReduction) {
  if (auto *CB = dyn_cast<CallBase>(I)) {
    // For reduction seeding, multi-argument calls are only considered when
    // explicitly enabled via the option.
    if (ForReduction && !NonVectReductions && CB->arg_size() > 1)
      return;
    unsigned ArgIdx = 0;
    for (Use &U : CB->args())
      F(U.get(), ArgIdx++);
  } else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
    F(RMW->getValOperand(), 0);
  } else if (auto *CX = dyn_cast<AtomicCmpXchgInst>(I)) {
    F(CX->getCompareOperand(), 0);
    F(CX->getNewValOperand(), 1);
  } else if (!ForReduction || NonVectReductions) {
    // Stores and returns participate in reduction seeding only when the
    // option is set; they always participate in bundle seeding.
    if (auto *SI = dyn_cast<StoreInst>(I)) {
      F(SI->getValueOperand(), 0);
    } else if (auto *RI = dyn_cast<ReturnInst>(I)) {
      if (RI->getNumOperands() > 0)
        F(RI->getReturnValue(), 0);
    } else {
      llvm_unreachable("Unexpected instruction kind for operand-chain seeding");
    }
  }
}
/// Tries to vectorize the operand chains of the non-vectorizable instructions
/// in \p InstRange (see isNonVectorizableInst). Two passes: first each operand
/// is tried as a horizontal-reduction root; then compatible operands across
/// different roots are grouped by a key and vectorized as bundles.
/// Returns true if any vectorization happened.
template <typename ItT>
bool SLPVectorizerPass::vectorizeNonVectorizableInsts(
    iterator_range<ItT> InstRange, BasicBlock *BB, BoUpSLP &R) {
  SmallVector<Instruction *> Insts(InstRange);
  if (Insts.empty())
    return false;
  // Group roots with the same opcode together; stable sort keeps the
  // original relative order within each opcode group.
  stable_sort(Insts, [](const Instruction *A, const Instruction *B) {
    return A->getOpcode() < B->getOpcode();
  });
  bool Changed = false;
  // Pass 1 - try to find horizontal reductions feeding the root operands.
  SmallPtrSet<Value *, 8> RootSeen;
  for (Instruction *I : Insts) {
    if (R.isDeleted(I))
      continue;
    // Set once the root I itself is erased by a successful vectorization;
    // remaining operands of I must then be skipped.
    bool RootDeleted = false;
    forEachOperandChainCandidate(
        I,
        [&](Value *Op, unsigned /*Position*/) {
          if (RootDeleted)
            return;
          // Only in-block, live, non-shuffle instruction operands with a
          // vectorizable element type are tried as reduction roots.
          auto *RootOp = dyn_cast<Instruction>(Op);
          if (!RootOp || RootOp->getParent() != BB || R.isDeleted(RootOp) ||
              isa<ShuffleVectorInst>(RootOp) ||
              !isValidElementType(RootOp->getType()))
            return;
          // Try each candidate root at most once across all insts.
          if (!RootSeen.insert(RootOp).second)
            return;
          Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
          if (R.isDeleted(I))
            RootDeleted = true;
        },
        /*ForReduction=*/true);
  }
  // Pass 2 - collect the operand instructions across all roots and try to
  // vectorize them as bundles.
  if (Insts.size() < 2)
    return Changed;
  // Key identifying which root "shape" an operand came from; operands are
  // only bundled together when their keys compare equal.
  struct OperandGroupKey {
    enum class Kind : unsigned {
      NonCall = 0,
      Intrinsic,
      NamedFunction,
      IndirectCall,
    };
    Kind RootKind;
    unsigned KindID;    // Intrinsic ID for Intrinsic, opcode for NonCall,
                        // callee value-kind for IndirectCall, 0 for
                        // NamedFunction.
    unsigned SubOp;     // AtomicRMW operation; 0 otherwise.
    StringRef FuncName; // Non-empty only for NamedFunction.
    unsigned Position;  // Operand slot within the root.
    bool operator==(const OperandGroupKey &O) const {
      return RootKind == O.RootKind && KindID == O.KindID && SubOp == O.SubOp &&
             FuncName == O.FuncName && Position == O.Position;
    }
    bool operator!=(const OperandGroupKey &O) const { return !(*this == O); }
    // Lexicographic order over all key fields; used only for sorting
    // operands so equal keys end up adjacent.
    bool less(const OperandGroupKey &O) const {
      if (RootKind != O.RootKind)
        return static_cast<unsigned>(RootKind) <
               static_cast<unsigned>(O.RootKind);
      if (KindID != O.KindID)
        return KindID < O.KindID;
      if (SubOp != O.SubOp)
        return SubOp < O.SubOp;
      if (int C = FuncName.compare(O.FuncName))
        return C < 0;
      return Position < O.Position;
    }
  };
  // Maps each collected operand to the key of the root/slot it feeds.
  SmallDenseMap<Value *, OperandGroupKey> OpKeys;
  // Builds the grouping key for the operand at slot \p Position of root I.
  auto BuildKey = [](Instruction *I, unsigned Position) -> OperandGroupKey {
    if (auto *CB = dyn_cast<CallBase>(I)) {
      if (auto *II = dyn_cast<IntrinsicInst>(CB))
        return {OperandGroupKey::Kind::Intrinsic,
                II->getIntrinsicID(),
                0,
                {},
                Position};
      if (Function *F = CB->getCalledFunction())
        return {OperandGroupKey::Kind::NamedFunction, 0, 0, F->getName(),
                Position};
      return {OperandGroupKey::Kind::IndirectCall,
              CB->getCalledOperand()->getValueID(),
              0,
              {},
              Position};
    }
    unsigned SubOp = 0;
    if (auto *AI = dyn_cast<AtomicRMWInst>(I))
      SubOp = static_cast<unsigned>(AI->getOperation());
    return {
        OperandGroupKey::Kind::NonCall, I->getOpcode(), SubOp, {}, Position};
  };
  // Strict weak order: first by group key, then by type/width/opcode, with
  // program order (comesBefore) as the final tie-breaker for determinism.
  auto OperandSorter = [&OpKeys](Value *V1, Value *V2) -> bool {
    if (V1 == V2)
      return false;
    const OperandGroupKey &K1 = OpKeys.at(V1);
    const OperandGroupKey &K2 = OpKeys.at(V2);
    if (K1 != K2)
      return K1.less(K2);
    auto *I1 = cast<Instruction>(V1);
    auto *I2 = cast<Instruction>(V2);
    if (I1->getType()->getTypeID() != I2->getType()->getTypeID())
      return I1->getType()->getTypeID() < I2->getType()->getTypeID();
    if (I1->getType()->getScalarSizeInBits() !=
        I2->getType()->getScalarSizeInBits())
      return I1->getType()->getScalarSizeInBits() <
             I2->getType()->getScalarSizeInBits();
    if (I1->getOpcode() != I2->getOpcode())
      return I1->getOpcode() < I2->getOpcode();
    return I1->comesBefore(I2);
  };
  // Two operands may share a bundle iff their keys, types and opcodes match.
  auto AreCompatibleOperands = [&OpKeys](ArrayRef<Value *> VL,
                                         Value *V) -> bool {
    if (VL.empty() || VL.back() == V)
      return true;
    const OperandGroupKey &KBack = OpKeys.at(VL.back());
    const OperandGroupKey &K = OpKeys.at(V);
    if (KBack != K)
      return false;
    auto *I1 = cast<Instruction>(VL.back());
    auto *I2 = cast<Instruction>(V);
    return I1->getType() == I2->getType() && I1->getOpcode() == I2->getOpcode();
  };
  SmallVector<Value *> Operands;
  SmallPtrSet<Value *, 8> Seen;
  for (Instruction *I : Insts) {
    if (R.isDeleted(I))
      continue;
    forEachOperandChainCandidate(
        I,
        [&](Value *Op, unsigned Position) {
          // Same filtering as pass 1: in-block, live, non-shuffle
          // instructions of vectorizable element type only.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || OpI->getParent() != BB || R.isDeleted(OpI) ||
              isa<ShuffleVectorInst>(OpI) ||
              !isValidElementType(OpI->getType()))
            return;
          if (!Seen.insert(OpI).second)
            return;
          // try_emplace: if the same operand feeds several roots, keep the
          // key of the first root encountered.
          OpKeys.try_emplace(OpI, BuildKey(I, Position));
          Operands.push_back(OpI);
        },
        /*ForReduction=*/false);
  }
  if (Operands.size() <= 1)
    return Changed;
  // MaxVFOnly=true: first try full-width bundles only; the helper retries
  // smaller groups itself.
  Changed |= tryToVectorizeSequence<Value>(
      Operands, OperandSorter, AreCompatibleOperands,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
@@ -30535,21 +30784,33 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
  // Non-vectorizable root instructions other than stores: calls
  // (regular and intrinsic), invokes, callbrs, atomic RMW/cmpxchg, and
  // returns.
  SmallSetVector<Instruction *, 8> PostProcessInsts;
  // Stores are processed after all other instructions/roots.
  SmallSetVector<StoreInst *, 8> PostProcessStores;
  auto VectorizeInsertsAndCmps = [&](bool AtTerminator) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
    if (AtTerminator) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
      if (!PostProcessInsts.empty())
        Changed |=
            vectorizeNonVectorizableInsts(reverse(PostProcessInsts), BB, R);
      PostProcessInsts.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  // Returns true if `I` is in any of the post-process sets.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    if (PostProcessInsts.contains(I))
      return true;
    if (auto *SI = dyn_cast<StoreInst>(I))
      return PostProcessStores.contains(SI);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
@@ -30572,7 +30833,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
@@ -30652,7 +30913,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
@@ -30665,8 +30926,47 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
    else if (auto *CI = dyn_cast<CmpInst>(It))
      PostProcessCmps.insert(CI);
    else if (auto *SI = dyn_cast<StoreInst>(It);
             SI &&
             (SLPReVec || !SI->getValueOperand()->getType()->isVectorTy()) &&
             isa<Instruction>(SI->getValueOperand()))
      PostProcessStores.insert(SI);
    else if (isNonVectorizableInst(&*It, TLI))
      PostProcessInsts.insert(&*It);
  }
  // Late post-process: run operand-chain vectorization for stores.
  if (!PostProcessStores.empty() &&
      (NonVectReductions || PostProcessStores.size() >= 2)) {
    if (!ForcePostProcessStoresOperands && SLPCostThreshold >= 0) {
      // Use pessimistic cost estimation to avoid long compile time when there
      // are many stores in the list.
      Type *ScalarTy = getValueType(PostProcessStores.front());
      if (!::isValidElementType(ScalarTy))
        return Changed;
      if (!NonVectReductions && PostProcessStores.size() == 2 &&
          cast<Instruction>(PostProcessStores.front()->getValueOperand())
                  ->getOpcode() !=
              cast<Instruction>(PostProcessStores.back()->getValueOperand())
                  ->getOpcode())
        return Changed;
      ScalarTy =
          IntegerType::get(ScalarTy->getContext(),
                           DL->getTypeSizeInBits(ScalarTy->getScalarType()));
      if (auto *ValTy = dyn_cast<VectorType>(
              PostProcessStores.front()->getValueOperand()->getType()))
        ScalarTy = ::getWidenedType(ScalarTy, getNumElements(ValTy));
      auto *VecTy = ::getWidenedType(ScalarTy, PostProcessStores.size());
      InstructionCost ExtractsCost = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, APInt::getAllOnes(PostProcessStores.size()),
          /*Insert=*/false, /*Extract=*/true, TTI::TCK_RecipThroughput,
          /*ForPoisonSrc=*/true, {}, TTI::VectorInstrContext::Store);
      if (ExtractsCost > PostProcessStores.size() + 1)
        return Changed;
    }
    Changed |= vectorizeNonVectorizableInsts(reverse(PostProcessStores), BB, R);
  }
  return Changed;
+0 −2
Original line number Diff line number Diff line
@@ -12,8 +12,6 @@ target triple = "aarch64--linux-gnu"
; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
; REMARK-NEXT:    - Cost: '-8'
;
; REMARK-NOT: Function: gather_load

define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: @gather_multiple_use(
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
+61 −98

File changed.

Preview size limit exceeded, changes collapsed.

+4 −3
Original line number Diff line number Diff line
@@ -5,11 +5,12 @@
define void @strided_load_and_store(ptr %in, ptr %out) {
; CHECK-LABEL: @strided_load_and_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 16
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[IN]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[TMP0]], align 2
; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 2 [[IN:%.*]], i64 16, <2 x i1> splat (i1 true), i32 2)
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 16
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    store <8 x i8> [[TMP1]], ptr [[OUT]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    store <8 x i8> [[TMP2]], ptr [[TMP3]], align 2
; CHECK-NEXT:    ret void
;
Loading