Commit 21d498c9 authored by Alexey Bataev

[SLP] Vectorize jumbled stores.

Summary:
Patch adds support for vectorization of jumbled stores. The value
operands are vectorized and then shuffled into the right order before
the store.

Reviewers: RKSimon, spatel, hfinkel, mkuper

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D43339
parent 72bc291f
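
Before the diff, an illustration of the pattern this enables (a hypothetical example, not taken from the patch or its tests): four stores to consecutive addresses written out of program order. Previously the SLP vectorizer required the stores to be consecutive in the order they appear in the bundle; with this patch it sorts the pointer operands, vectorizes the value operands, and inserts one shuffle before a single wide store.

    // Hypothetical C++ input: the four stores hit out[0..3] but are
    // written in the order 1, 3, 0, 2, so the SLP store bundle is
    // "jumbled" rather than consecutive.
    void jumbled(int *__restrict out, const int *__restrict a,
                 const int *__restrict b) {
      out[1] = a[1] * b[1];
      out[3] = a[3] * b[3];
      out[0] = a[0] * b[0];
      out[2] = a[2] * b[2];
    }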
+91 −16
@@ -2666,24 +2666,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or if we need to swizzle them.
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+      llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+      // Make sure all stores in the bundle are simple - we can't vectorize
+      // atomic or volatile stores.
+      SmallVector<Value *, 4> PointerOps(VL.size());
+      ValueList Operands(VL.size());
+      auto POIter = PointerOps.begin();
+      auto OIter = Operands.begin();
+      for (Value *V : VL) {
+        auto *SI = cast<StoreInst>(V);
+        if (!SI->isSimple()) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
           return;
         }
+        *POIter = SI->getPointerOperand();
+        *OIter = SI->getValueOperand();
+        ++POIter;
+        ++OIter;
+      }
 
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndicies);
-      LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
-
-      ValueList Operands;
-      for (Value *V : VL)
-        Operands.push_back(cast<Instruction>(V)->getOperand(0));
-
-      TE->setOperandsInOrder();
-      buildTree_rec(Operands, Depth + 1, {TE, 0});
-      return;
+      OrdersType CurrentOrder;
+      // Check the order of pointer operands.
+      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+        Value *Ptr0;
+        Value *PtrN;
+        if (CurrentOrder.empty()) {
+          Ptr0 = PointerOps.front();
+          PtrN = PointerOps.back();
+        } else {
+          Ptr0 = PointerOps[CurrentOrder.front()];
+          PtrN = PointerOps[CurrentOrder.back()];
+        }
+        const SCEV *Scev0 = SE->getSCEV(Ptr0);
+        const SCEV *ScevN = SE->getSCEV(PtrN);
+        const auto *Diff =
+            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+        // Check that the sorted pointer operands are consecutive.
+        if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+          if (CurrentOrder.empty()) {
+            // Original stores are consecutive and do not require reordering.
+            ++NumOpsWantToKeepOriginalOrder;
+            TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+                                         UserTreeIdx, ReuseShuffleIndicies);
+            TE->setOperandsInOrder();
+            buildTree_rec(Operands, Depth + 1, {TE, 0});
+            LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+          } else {
+            // Need to reorder.
+            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            ++(I->getSecond());
+            TreeEntry *TE =
+                newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                             ReuseShuffleIndicies, I->getFirst());
+            TE->setOperandsInOrder();
+            buildTree_rec(Operands, Depth + 1, {TE, 0});
+            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+          }
+          return;
+        }
+      }
+
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
+      LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+      return;
     }
     case Instruction::Call: {
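
The shape of the new check, restated: sortPtrAccesses computes a sorted order for the pointer operands (leaving CurrentOrder empty when they are already in order), and the bundle is accepted only when the SCEV distance from the first to the last sorted pointer is exactly (VL.size() - 1) * element size. A minimal standalone sketch of that distance test on plain byte offsets; the helper name is hypothetical and this is not the LLVM API:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Hypothetical standalone model of the check in buildTree_rec: sort
    // store offsets indirectly, then require that the sorted extremes are
    // exactly (n - 1) * EltSize apart, mirroring the SCEVConstant test.
    static bool acceptStoreBundle(const std::vector<uint64_t> &Offsets,
                                  uint64_t EltSize,
                                  std::vector<unsigned> &Order) {
      Order.resize(Offsets.size());
      std::iota(Order.begin(), Order.end(), 0u);
      std::stable_sort(Order.begin(), Order.end(), [&](unsigned A, unsigned B) {
        return Offsets[A] < Offsets[B];
      });
      uint64_t Dist = Offsets[Order.back()] - Offsets[Order.front()];
      if (Dist != (Offsets.size() - 1) * EltSize)
        return false; // not a contiguous run once sorted
      if (std::is_sorted(Offsets.begin(), Offsets.end()))
        Order.clear(); // already consecutive; no reorder shuffle needed
      return true;     // jumbled but contiguous: vectorize + shuffle
    }

For example, Offsets = {4, 12, 0, 8} with EltSize = 4 gives Dist = 12 = 3 * 4, so the bundle is accepted with Order = {2, 0, 3, 1}.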
@@ -3181,15 +3231,23 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
-      MaybeAlign alignment(cast<StoreInst>(VL0)->getAlignment());
+      bool IsReorder = !E->ReorderIndices.empty();
+      auto *SI =
+          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+      MaybeAlign Alignment(SI->getAlignment());
       int ScalarEltCost =
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
+          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecStCost =
-          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
+                                           VecTy, Alignment, 0, VL0);
+      if (IsReorder) {
+        // TODO: Merge this shuffle with the ReuseShuffleCost.
+        VecStCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+      }
       return ReuseShuffleCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
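
The cost change is small: a reordered bundle pays for one extra single-source permute on top of the vector store. A toy rendition of that arithmetic with made-up unit costs (real numbers come from TargetTransformInfo):

    #include <cstdio>

    // Toy version of the Instruction::Store arm of getEntryCost; every
    // cost below is invented for illustration, not queried from TTI.
    int main() {
      const int ScalarEltCost = 1;                // cost of one scalar store
      const int ScalarStCost = 4 * ScalarEltCost; // four lanes
      int VecStCost = 1;                          // one vector store
      const bool IsReorder = true;                // bundle needs a shuffle
      if (IsReorder)
        VecStCost += 1; // extra SK_PermuteSingleSrc shuffle
      // A negative delta means the vectorized form is still cheaper.
      std::printf("cost delta = %d\n", VecStCost - ScalarStCost); // -2
      return 0;
    }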
@@ -4051,13 +4109,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       return V;
     }
     case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(VL0);
+      bool IsReorder = !E->ReorderIndices.empty();
+      auto *SI = cast<StoreInst>(
+          IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
       unsigned Alignment = SI->getAlignment();
       unsigned AS = SI->getPointerAddressSpace();
 
       setInsertPointAfterBundle(E);
 
       Value *VecValue = vectorizeTree(E->getOperand(0));
+      if (IsReorder) {
+        OrdersType Mask;
+        inversePermutation(E->ReorderIndices, Mask);
+        VecValue = Builder.CreateShuffleVector(
+            VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
+            "reorder_shuffle");
+      }
       Value *ScalarPtr = SI->getPointerOperand();
       Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
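
In vectorizeTree the value vector is built in the original program order of the stores, and a shufflevector then lines the lanes up with the sorted addresses. A standalone model of the two pieces involved (hypothetical helpers, not the LLVM ones): shufflevector semantics Res[I] = Src[Mask[I]], and the inverse permutation that inversePermutation computes:

    #include <cassert>
    #include <vector>

    // shufflevector semantics: Res[I] = Src[Mask[I]].
    static std::vector<int> shuffle(const std::vector<int> &Src,
                                    const std::vector<unsigned> &Mask) {
      std::vector<int> Res(Mask.size());
      for (unsigned I = 0; I < Mask.size(); ++I)
        Res[I] = Src[Mask[I]];
      return Res;
    }

    // Inverse permutation: if Order[I] is the element that belongs in
    // slot I, then Inv[Order[I]] = I.
    static std::vector<unsigned> inverse(const std::vector<unsigned> &Order) {
      std::vector<unsigned> Inv(Order.size());
      for (unsigned I = 0; I < Order.size(); ++I)
        Inv[Order[I]] = I;
      return Inv;
    }

    int main() {
      std::vector<int> Vals = {10, 11, 12, 13};   // program order
      std::vector<unsigned> Order = {1, 3, 0, 2}; // sorted store order
      assert((shuffle(Vals, Order) == std::vector<int>{11, 13, 10, 12}));
      assert((inverse(Order) == std::vector<unsigned>{2, 0, 3, 1}));
      return 0;
    }

The mask <1, 3, 0, 2> used here is the same one that appears in the updated CHECK lines below.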
@@ -5347,6 +5414,14 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                     << "\n");
 
   R.buildTree(Chain);
+  Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+  if (Order) {
+    // TODO: reorder tree nodes without tree rebuilding.
+    SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
+    llvm::transform(*Order, ReorderedOps.begin(),
+                    [Chain](const unsigned Idx) { return Chain[Idx]; });
+    R.buildTree(ReorderedOps);
+  }
   if (R.isTreeTinyAndNotFullyVectorizable())
     return false;
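
vectorizeStoreChain now asks the tree for a preferred order and, per the TODO above, rebuilds the tree from a permuted copy of the chain rather than reordering nodes in place. A sketch of just the permutation step, using std::transform in place of llvm::transform and strings standing in for the store instructions (a hypothetical example):

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <vector>

    int main() {
      // Stand-ins for the scalar stores in Chain, in program order.
      std::vector<std::string> Chain = {"st0", "st1", "st2", "st3"};
      std::vector<unsigned> Order = {1, 3, 0, 2}; // what bestOrder() chose
      std::vector<std::string> ReorderedOps(Chain.size());
      std::transform(Order.begin(), Order.end(), ReorderedOps.begin(),
                     [&Chain](unsigned Idx) { return Chain[Idx]; });
      // The tree is then rebuilt from this permuted chain instead of Chain.
      assert((ReorderedOps ==
              std::vector<std::string>{"st1", "st3", "st0", "st2"}));
      return 0;
    }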

+3 −4
@@ -11,21 +11,20 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
+4 −5
@@ -92,15 +92,14 @@ define void @store_reverse(i64* %p3) {
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl <4 x i64> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry: