[SLP]Fix PR70004: Do not change insert point for reduction gather nodes. (529aa6ea) · Commits · llvm-doe / llvm-project

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+2 −1

Original line number	Diff line number	Diff line
		@@ -10118,7 +10118,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
		}

		if (E->State == TreeEntry::NeedToGather) {
		if (E->getMainOp() && E->Idx == 0)
		// Set insert point for non-reduction initial nodes.
		if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
		setInsertPointAfterBundle(E);
		Value *Vec = createBuildVector(E);
		E->VectorizedValue = Vec;

llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll

0 → 100644

+44 −0

Original line number	Diff line number	Diff line
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
		; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s

		define void @tes() {
		; CHECK-LABEL: define void @tes() {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
		; CHECK-NEXT: br label [[TMP1:%.*]]
		; CHECK: 1:
		; CHECK-NEXT: [[TMP2:%.*]] = select i1 false, i1 false, i1 false
		; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
		; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
		; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
		; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 false, i1 false
		; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP2]], i1 [[OP_RDX]], i1 false
		; CHECK-NEXT: br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP7:%.*]]
		; CHECK: 6:
		; CHECK-NEXT: ret void
		; CHECK: 7:
		; CHECK-NEXT: ret void
		;
		entry:
		%0 = extractelement <2 x i1> zeroinitializer, i64 0
		%1 = extractelement <2 x i1> zeroinitializer, i64 0
		%2 = fcmp ole <2 x double> zeroinitializer, zeroinitializer
		%3 = extractelement <2 x i1> %2, i64 0
		%4 = extractelement <2 x i1> zeroinitializer, i64 0
		br label %5

		5:
		%6 = select i1 false, i1 false, i1 false
		%7 = select i1 %6, i1 %0, i1 false
		%8 = select i1 %7, i1 %1, i1 false
		%9 = select i1 %8, i1 false, i1 false
		%10 = select i1 %9, i1 %3, i1 false
		%11 = select i1 %10, i1 %4, i1 false
		br i1 %11, label %12, label %13

		12:
		ret void

		13:
		ret void
		}

llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll

+20 −20

Original line number	Diff line number	Diff line
		@@ -18,11 +18,11 @@
		define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
		; SSE2-LABEL: @reduce_and4(
		; SSE2-NEXT: entry:
		; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
		; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
		; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
		; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; SSE2-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
		; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
		; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
		; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
		; SSE2-NEXT: ret i32 [[OP_RDX1]]
		;
		@@ -40,11 +40,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
		;
		; AVX-LABEL: @reduce_and4(
		; AVX-NEXT: entry:
		; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
		; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
		; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
		; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; AVX-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
		; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
		; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
		; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
		; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
		; AVX-NEXT: ret i32 [[OP_RDX1]]
		;
		@@ -94,11 +94,11 @@ entry:

		define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
		; SSE2-LABEL: @reduce_and4_transpose(
		; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
		; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
		; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
		; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
		; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
		; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
		; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
		; SSE2-NEXT: ret i32 [[OP_RDX1]]
		;
		@@ -114,11 +114,11 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
		; SSE42-NEXT: ret i32 [[OP_RDX3]]
		;
		; AVX-LABEL: @reduce_and4_transpose(
		; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
		; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
		; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
		; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
		; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
		; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
		; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
		; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
		; AVX-NEXT: ret i32 [[OP_RDX1]]
		;