[SLP] Look-ahead operand reordering heuristic. (6a18a954) · Commits · llvm-doe / llvm-project

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+260 −46

Original line number	Diff line number	Diff line
		@@ -147,6 +147,20 @@ static cl::opt<unsigned> MinTreeSize(
		"slp-min-tree-size", cl::init(3), cl::Hidden,
		cl::desc("Only vectorize small trees if they are fully vectorizable"));

		// The maximum depth that the look-ahead score heuristic will explore.
		// The higher this value, the higher the compilation time overhead.
		// getLookAheadScore() passes this value as the MaxLevel bound of the
		// recursive score computation, which starts counting at level 1.
		static cl::opt<int> LookAheadMaxDepth(
		"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
		cl::desc("The maximum look-ahead depth for operand reordering scores"));

		// The Look-ahead heuristic goes through the users of the bundle to calculate
		// the users cost in getExternalUsesCost(). To avoid compilation time increase
		// we limit the number of users visited to this value. The budget is applied
		// separately to each of the two values whose pairing is being scored.
		static cl::opt<unsigned> LookAheadUsersBudget(
		"slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
		cl::desc("The maximum number of users to visit while visiting the "
		"predecessors. This prevents compilation time increase."));

		// Hidden command-line flag that requests displaying the SLP trees with
		// Graphviz.
		static cl::opt<bool>
		ViewSLPTree("view-slp-tree", cl::Hidden,
		cl::desc("Display the SLP trees with Graphviz"));
		@@ -721,6 +735,7 @@ public:

		const DataLayout &DL;
		ScalarEvolution &SE;
		const BoUpSLP &R;

		/// \returns the operand data at \p OpIdx and \p Lane.
		OperandData &getData(unsigned OpIdx, unsigned Lane) {
		@@ -746,6 +761,227 @@ public:
		std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
		}

		// The hard-coded scores listed here are not very important. When computing
		// the scores of matching one sub-tree with another, we are basically
		// counting the number of values that are matching. So even if all scores
		// are set to 1, we would still get a decent matching result.
		// However, sometimes we have to break ties. For example we may have to
		// choose between matching loads vs matching opcodes. This is what these
		// scores are helping us with: they provide the order of preference.
		// The two *Cost values at the end are not scores: getScoreAtLevelRec()
		// subtracts the accumulated use cost from the shallow score of a pair.

		/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
		static const int ScoreConsecutiveLoads = 3;
		/// ExtractElementInst from same vector and consecutive indexes.
		static const int ScoreConsecutiveExtracts = 3;
		/// Constants.
		static const int ScoreConstants = 2;
		/// Instructions with the same opcode.
		static const int ScoreSameOpcode = 2;
		/// Instructions with alt opcodes (e.g, add + sub).
		static const int ScoreAltOpcodes = 1;
		/// Identical instructions (a.k.a. splat or broadcast).
		static const int ScoreSplat = 1;
		/// Matching with an undef is preferable to failing.
		static const int ScoreUndef = 1;
		/// Score for failing to find a decent match.
		static const int ScoreFail = 0;
		/// User external to the vectorized code.
		static const int ExternalUseCost = 1;
		/// The user is internal but in a different lane.
		static const int UserInDiffLaneCost = ExternalUseCost;

		/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
		static int getShallowScore(Value V1, Value V2, const DataLayout &DL,
		ScalarEvolution &SE) {
		auto *LI1 = dyn_cast<LoadInst>(V1);
		auto *LI2 = dyn_cast<LoadInst>(V2);
		if (LI1 && LI2)
		return isConsecutiveAccess(LI1, LI2, DL, SE)
		? VLOperands::ScoreConsecutiveLoads
		: VLOperands::ScoreFail;

		auto *C1 = dyn_cast<Constant>(V1);
		auto *C2 = dyn_cast<Constant>(V2);
		if (C1 && C2)
		return VLOperands::ScoreConstants;

		// Extracts from consecutive indexes of the same vector better score as
		// the extracts could be optimized away.
		auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
		auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
		if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
		cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
		cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
		return VLOperands::ScoreConsecutiveExtracts;
		}

		auto *I1 = dyn_cast<Instruction>(V1);
		auto *I2 = dyn_cast<Instruction>(V2);
		if (I1 && I2) {
		if (I1 == I2)
		return VLOperands::ScoreSplat;
		InstructionsState S = getSameOpcode({I1, I2});
		// Note: Only consider instructions with <= 2 operands to avoid
		// complexity explosion.
		if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
		return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
		: VLOperands::ScoreSameOpcode;
		}

		if (isa<UndefValue>(V2))
		return VLOperands::ScoreUndef;

		return VLOperands::ScoreFail;
		}

		/// Holds the values and their lane that are taking part in the look-ahead
		/// score calculation. This is used in the external uses cost calculation.
		/// The map is cleared by getLookAheadScore() before each score computation
		/// and filled by getScoreAtLevelRec() for every pair it recurses into.
		SmallDenseMap<Value *, int> InLookAheadValues;

		/// \returns the additional cost due to uses of \p LHS and \p RHS that are
		/// either external to the vectorized code, or require shuffling.
		///
		/// Each argument pairs a value with its lane. For each of the two values,
		/// at most LookAheadUsersBudget users are inspected; a budget of zero
		/// inspects no users at all. A user adds:
		///  - UserInDiffLaneCost if it is in the vectorizable tree or in the
		///    look-ahead values but sits in a different lane than the value;
		///  - ExternalUseCost if it is in neither of them.
		int getExternalUsesCost(const std::pair<Value *, int> &LHS,
		                        const std::pair<Value *, int> &RHS) {
		  int Cost = 0;
		  SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
		  for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
		    Value *V = Values[Idx].first;
		    // Calculate the absolute lane, using the minimum relative lane of LHS
		    // and RHS as base and Idx as the offset.
		    int Ln = std::min(LHS.second, RHS.second) + Idx;
		    assert(Ln >= 0 && "Bad lane calculation");
		    unsigned UsersBudget = LookAheadUsersBudget;
		    for (User *U : V->users()) {
		      // Limit the number of visited uses to cap compilation time. The
		      // budget is tested before the visit: pre-decrementing an unsigned
		      // budget of zero would wrap around and lift the cap entirely.
		      if (UsersBudget == 0)
		        break;
		      --UsersBudget;

		      if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
		        // The user is in the VectorizableTree. Check if we need to insert.
		        auto It = llvm::find(UserTE->Scalars, U);
		        assert(It != UserTE->Scalars.end() && "U is in UserTE");
		        int UserLn = std::distance(UserTE->Scalars.begin(), It);
		        assert(UserLn >= 0 && "Bad lane");
		        if (UserLn != Ln)
		          Cost += UserInDiffLaneCost;
		      } else {
		        // Check if the user is in the look-ahead code.
		        auto It2 = InLookAheadValues.find(U);
		        if (It2 != InLookAheadValues.end()) {
		          // The user is in the look-ahead code. Check the lane.
		          if (It2->second != Ln)
		            Cost += UserInDiffLaneCost;
		        } else {
		          // The user is neither in SLP tree nor in the look-ahead code.
		          Cost += ExternalUseCost;
		        }
		      }
		    }
		  }
		  return Cost;
		}

		/// Go through the operands of \p LHS and \p RHS recursively until \p
		/// MaxLevel, and return the cumulative score. For example:
		/// \verbatim
		///  A[0]  B[0]  A[1]  B[1]  C[0]  D[0]  B[1]  A[1]
		///     \  /        \  /        \  /        \  /
		///      +           +           +           +
		///     G1          G2          G3          G4
		/// \endverbatim
		/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
		/// each level recursively, accumulating the score. It starts from matching
		/// the additions at level 0, then moves on to the loads (level 1). The
		/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
		/// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
		/// {A[0],C[0]} has a score of VLOperands::ScoreFail.
		/// Please note that the order of the operands does not matter, as we
		/// evaluate the score of all profitable combinations of operands. In
		/// other words the score of G1 and G4 is the same as G1 and G2. This
		/// heuristic is based on ideas described in:
		///   Look-ahead SLP: Auto-vectorization in the presence of commutative
		///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
		///   Luís F. W. Góes
		///
		/// \p LHS and \p RHS each pair a value with its lane. \p CurrLevel is the
		/// level of the current pair; the recursion stops once it equals
		/// \p MaxLevel. As a side effect, every pair that is recursed into is
		/// recorded in InLookAheadValues.
		int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
		                       const std::pair<Value *, int> &RHS, int CurrLevel,
		                       int MaxLevel) {

		  Value *V1 = LHS.first;
		  Value *V2 = RHS.first;
		  // Get the shallow score of V1 and V2, reduced by the cost of their
		  // external / cross-lane uses and clamped so it never drops below
		  // ScoreFail.
		  int ShallowScoreAtThisLevel =
		      std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
		                                   getExternalUsesCost(LHS, RHS));
		  int Lane1 = LHS.second;
		  int Lane2 = RHS.second;

		  // If reached MaxLevel,
		  //  or if V1 and V2 are not instructions,
		  //  or if they are SPLAT,
		  //  or if they are not consecutive, early return the current cost.
		  auto *I1 = dyn_cast<Instruction>(V1);
		  auto *I2 = dyn_cast<Instruction>(V2);
		  if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
		      ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
		      (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
		    return ShallowScoreAtThisLevel;
		  assert(I1 && I2 && "Should have early exited.");

		  // Keep track of in-tree values for determining the external-use cost.
		  InLookAheadValues[V1] = Lane1;
		  InLookAheadValues[V2] = Lane2;

		  // Contains the I2 operand indexes that got matched with I1 operands.
		  SmallSet<unsigned, 4> Op2Used;

		  // Whether I2 is commutative does not change across iterations, so query
		  // it once up front.
		  const bool IsI2Commutative = isCommutative(I2);

		  // Recursion towards the operands of I1 and I2. We are trying all possible
		  // operand pairs, and keeping track of the best score.
		  for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
		       OpIdx1 != NumOperands1; ++OpIdx1) {
		    // Try to pair the operand at OpIdx1 with the best operand of I2.
		    int MaxTmpScore = 0;
		    unsigned MaxOpIdx2 = 0;
		    bool FoundBest = false;
		    // If I2 is commutative try all combinations, otherwise only the
		    // operand at the same index (if I2 has one).
		    unsigned FromIdx = IsI2Commutative ? 0 : OpIdx1;
		    unsigned ToIdx = IsI2Commutative
		                         ? I2->getNumOperands()
		                         : std::min(I2->getNumOperands(), OpIdx1 + 1);
		    assert(FromIdx <= ToIdx && "Bad index");
		    for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
		      // Skip I2 operands that were already paired with an I1 operand.
		      if (Op2Used.count(OpIdx2))
		        continue;
		      // Recursively calculate the cost at each level
		      int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
		                                        {I2->getOperand(OpIdx2), Lane2},
		                                        CurrLevel + 1, MaxLevel);
		      // Look for the best score.
		      if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
		        MaxTmpScore = TmpScore;
		        MaxOpIdx2 = OpIdx2;
		        FoundBest = true;
		      }
		    }
		    if (FoundBest) {
		      // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
		      Op2Used.insert(MaxOpIdx2);
		      ShallowScoreAtThisLevel += MaxTmpScore;
		    }
		  }
		  return ShallowScoreAtThisLevel;
		}

		/// \returns the look-ahead score of the sub-trees rooted at \p LHS and
		/// \p RHS: the better the two sub-trees match, the higher the score. It is
		/// used to break ties in an informed way when the immediate predecessors
		/// alone are not enough to decide on the order of the operands.
		int getLookAheadScore(const std::pair<Value *, int> &LHS,
		                      const std::pair<Value *, int> &RHS) {
		  // Start every query from an empty set of look-ahead values.
		  InLookAheadValues.clear();
		  // The root pair is level 1; explore down to the configured depth.
		  const int Score = getScoreAtLevelRec(LHS, RHS, /*CurrLevel=*/1,
		                                       /*MaxLevel=*/LookAheadMaxDepth);
		  return Score;
		}

		// Search all operands in Ops[*][Lane] for the one that matches best
		// Ops[OpIdx][LastLane] and return its operand index.
		// If no good match can be found, return None.
		@@ -763,9 +999,6 @@ public:
		// The linearized opcode of the operand at OpIdx, Lane.
		bool OpIdxAPO = getData(OpIdx, Lane).APO;

		const unsigned BestScore = 2;
		const unsigned GoodScore = 1;

		// The best operand index and its score.
		// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
		// are using the score to differentiate between the two.
		@@ -794,41 +1027,19 @@ public:
		// Look for an operand that matches the current mode.
		switch (RMode) {
		case ReorderingMode::Load:
		if (isa<LoadInst>(Op)) {
		// Figure out which is left and right, so that we can check for
		// consecutive loads
		case ReorderingMode::Constant:
		case ReorderingMode::Opcode: {
		bool LeftToRight = Lane > LastLane;
		Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
		Value *OpRight = (LeftToRight) ? Op : OpLastLane;
		if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
		cast<LoadInst>(OpRight), DL, SE))
		BestOp.Idx = Idx;
		}
		break;
		case ReorderingMode::Opcode:
		// We accept both Instructions and Undefs, but with different scores.
		if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
		cast<Instruction>(Op)->getOpcode() ==
		cast<Instruction>(OpLastLane)->getOpcode()) \|\|
		(isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) \|\|
		isa<UndefValue>(Op)) {
		// An instruction has a higher score than an undef.
		unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
		unsigned Score =
		getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
		if (Score > BestOp.Score) {
		BestOp.Idx = Idx;
		BestOp.Score = Score;
		}
		}
		break;
		case ReorderingMode::Constant:
		if (isa<Constant>(Op)) {
		unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
		if (Score > BestOp.Score) {
		BestOp.Idx = Idx;
		BestOp.Score = Score;
		}
		}
		break;
		case ReorderingMode::Splat:
		if (Op == OpLastLane)
		BestOp.Idx = Idx;
		@@ -959,8 +1170,8 @@ public:
		public:
		/// Initialize with all the operands of the instruction vector \p RootVL.
		VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
		ScalarEvolution &SE)
		: DL(DL), SE(SE) {
		ScalarEvolution &SE, const BoUpSLP &R)
		: DL(DL), SE(SE), R(R) {
		// Append all the operands of RootVL.
		appendOperandsOfVL(RootVL);
		}
		@@ -1189,7 +1400,8 @@ private:
		SmallVectorImpl<Value *> &Left,
		SmallVectorImpl<Value *> &Right,
		const DataLayout &DL,
		ScalarEvolution &SE);
		ScalarEvolution &SE,
		const BoUpSLP &R);
		struct TreeEntry {
		using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
		TreeEntry(VecTreeTy &Container) : Container(Container) {}
		@@ -2550,7 +2762,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
		// Commutative predicate - collect + sort operands of the instructions
		// so that each side is more likely to have the same opcode.
		assert(P0 == SwapP0 && "Commutative Predicate mismatch");
		reorderInputsAccordingToOpcode(VL, Left, Right, DL, SE);
		reorderInputsAccordingToOpcode(VL, Left, Right, DL, SE, *this);
		} else {
		// Collect operands - commute if it uses the swapped predicate.
		for (Value *V : VL) {
		@@ -2597,7 +2809,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
		// have the same opcode.
		if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
		ValueList Left, Right;
		reorderInputsAccordingToOpcode(VL, Left, Right, DL, SE);
		reorderInputsAccordingToOpcode(VL, Left, Right, DL, SE, *this);
		TE->setOperand(0, Left);
		TE->setOperand(1, Right);
		buildTree_rec(Left, Depth + 1, {TE, 0});
		@@ -2789,7 +3001,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
		// Reorder operands if reordering would enable vectorization.
		if (isa<BinaryOperator>(VL0)) {
		ValueList Left, Right;
		reorderInputsAccordingToOpcode(VL, Left, Right, DL, SE);
		reorderInputsAccordingToOpcode(VL, Left, Right, DL, SE, *this);
		TE->setOperand(0, Left);
		TE->setOperand(1, Right);
		buildTree_rec(Left, Depth + 1, {TE, 0});
		@@ -3550,13 +3762,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {

		// Perform operand reordering on the instructions in VL and return the reordered
		// operands in Left and Right.
		void BoUpSLP::reorderInputsAccordingToOpcode(
		ArrayRef<Value > VL, SmallVectorImpl<Value > &Left,
		SmallVectorImpl<Value *> &Right, const DataLayout &DL,
		ScalarEvolution &SE) {
		void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
		SmallVectorImpl<Value *> &Left,
		SmallVectorImpl<Value *> &Right,
		const DataLayout &DL,
		ScalarEvolution &SE,
		const BoUpSLP &R) {
		if (VL.empty())
		return;
		VLOperands Ops(VL, DL, SE);
		VLOperands Ops(VL, DL, SE, R);
		// Reorder the operands in place.
		Ops.reorder();
		Left = Ops.getVL(0);

llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll

+41 −58

File changed.

Preview size limit exceeded, changes collapsed.

llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll

+223 −33

File changed.

Preview size limit exceeded, changes collapsed.