Unverified Commit c9140eb1 authored by Paul Walker's avatar Paul Walker Committed by GitHub
Browse files

[LLVM][X86InstCombine] Extend mask combines to cover ConstantInt/FP based splats. (#195090)

parent 6e2cb043
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -40,8 +40,8 @@ static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);
  if (isa<ConstantInt, ConstantFP, ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(cast<Constant>(Mask), DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
@@ -2973,9 +2973,9 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
    if (isa<ConstantInt, ConstantFP, ConstantDataVector>(Mask)) {
      Constant *NewSelector =
          getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
          getNegativeIsTrueBoolVec(cast<Constant>(Mask), IC.getDataLayout());
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }
    unsigned BitWidth = Mask->getType()->getScalarSizeInBits();
+50 −1
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat=false -use-constant-fp-for-fixed-length-splat=false -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat -S | FileCheck %s

define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd(
@@ -18,6 +19,14 @@ define <2 x double> @constant_blendvpd_zero(<2 x double> %xy, <2 x double> %ab)
  ret <2 x double> %1
}

; Mask is a splat of -0.0 (sign bit set in every lane); blendvpd selects the
; second source per the mask sign bit, so instcombine folds this to `ret %ab`
; (pinned by the CHECK-NEXT line below).
define <2 x double> @constant_blendvpd_nzero(<2 x double> %xy, <2 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd_nzero(
; CHECK-NEXT:    ret <2 x double> [[AB:%.*]]
;
  %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> splat (double -0.000000e+00))
  ret <2 x double> %1
}

define <2 x double> @constant_blendvpd_dup(<2 x double> %xy, <2 x double> %sel) {
; CHECK-LABEL: @constant_blendvpd_dup(
; CHECK-NEXT:    ret <2 x double> [[XY:%.*]]
@@ -43,6 +52,14 @@ define <4 x float> @constant_blendvps_zero(<4 x float> %xyzw, <4 x float> %abcd)
  ret <4 x float> %1
}

; Same as the blendvpd case but for <4 x float>: a -0.0 splat mask has every
; sign bit set, so the blend folds to the second operand (see CHECK-NEXT).
define <4 x float> @constant_blendvps_nzero(<4 x float> %xyzw, <4 x float> %abcd) {
; CHECK-LABEL: @constant_blendvps_nzero(
; CHECK-NEXT:    ret <4 x float> [[ABCD:%.*]]
;
  %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> splat (float -0.000000e+00))
  ret <4 x float> %1
}

define <4 x float> @constant_blendvps_dup(<4 x float> %xyzw, <4 x float> %sel) {
; CHECK-LABEL: @constant_blendvps_dup(
; CHECK-NEXT:    ret <4 x float> [[XYZW:%.*]]
@@ -68,6 +85,14 @@ define <16 x i8> @constant_pblendvb_zero(<16 x i8> %xyzw, <16 x i8> %abcd) {
  ret <16 x i8> %1
}

; Integer splat mask: i8 -1 (0xFF) has the sign bit set in every byte lane,
; so pblendvb folds to the second operand (pinned by CHECK-NEXT). Exercises
; the ConstantInt-splat path added by this commit.
define <16 x i8> @constant_pblendvb_all_ones(<16 x i8> %xyzw, <16 x i8> %abcd) {
; CHECK-LABEL: @constant_pblendvb_all_ones(
; CHECK-NEXT:    ret <16 x i8> [[ABCD:%.*]]
;
  %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> splat (i8 -1))
  ret <16 x i8> %1
}

define <16 x i8> @constant_pblendvb_dup(<16 x i8> %xyzw, <16 x i8> %sel) {
; CHECK-LABEL: @constant_pblendvb_dup(
; CHECK-NEXT:    ret <16 x i8> [[XYZW:%.*]]
@@ -93,6 +118,14 @@ define <4 x double> @constant_blendvpd_avx_zero(<4 x double> %xy, <4 x double> %
  ret <4 x double> %1
}

; 256-bit AVX variant: -0.0 splat mask (all sign bits set) folds the blend
; to the second operand (see CHECK-NEXT).
define <4 x double> @constant_blendvpd_avx_nzero(<4 x double> %xy, <4 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd_avx_nzero(
; CHECK-NEXT:    ret <4 x double> [[AB:%.*]]
;
  %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> splat (double -0.000000e+00))
  ret <4 x double> %1
}

define <4 x double> @constant_blendvpd_avx_dup(<4 x double> %xy, <4 x double> %sel) {
; CHECK-LABEL: @constant_blendvpd_avx_dup(
; CHECK-NEXT:    ret <4 x double> [[XY:%.*]]
@@ -118,6 +151,14 @@ define <8 x float> @constant_blendvps_avx_zero(<8 x float> %xyzw, <8 x float> %a
  ret <8 x float> %1
}

; 256-bit AVX float variant: -0.0 splat mask (all sign bits set) folds the
; blend to the second operand (see CHECK-NEXT).
define <8 x float> @constant_blendvps_avx_nzero(<8 x float> %xyzw, <8 x float> %abcd) {
; CHECK-LABEL: @constant_blendvps_avx_nzero(
; CHECK-NEXT:    ret <8 x float> [[ABCD:%.*]]
;
  %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> splat (float -0.000000e+00))
  ret <8 x float> %1
}

define <8 x float> @constant_blendvps_avx_dup(<8 x float> %xyzw, <8 x float> %sel) {
; CHECK-LABEL: @constant_blendvps_avx_dup(
; CHECK-NEXT:    ret <8 x float> [[XYZW:%.*]]
@@ -147,6 +188,14 @@ define <32 x i8> @constant_pblendvb_avx2_zero(<32 x i8> %xyzw, <32 x i8> %abcd)
  ret <32 x i8> %1
}

; AVX2 256-bit byte blend: i8 -1 splat mask has every lane's sign bit set,
; so the blend folds to the second operand (pinned by CHECK-NEXT).
define <32 x i8> @constant_pblendvb_avx2_all_ones(<32 x i8> %xyzw, <32 x i8> %abcd) {
; CHECK-LABEL: @constant_pblendvb_avx2_all_ones(
; CHECK-NEXT:    ret <32 x i8> [[XYZW:%.*]]

define <32 x i8> @constant_pblendvb_avx2_dup(<32 x i8> %xyzw, <32 x i8> %sel) {
; CHECK-LABEL: @constant_pblendvb_avx2_dup(
; CHECK-NEXT:    ret <32 x i8> [[XYZW:%.*]]
+19 −1
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -use-constant-int-for-fixed-length-splat=false -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -use-constant-int-for-fixed-length-splat -S | FileCheck %s

;; MASKED LOADS

@@ -48,6 +49,14 @@ define <4 x float> @mload_fake_ones(ptr %f) {
  ret <4 x float> %ld
}

; Splat mask of i32 1: low bit set but sign bit clear in every lane, so the
; AVX maskload loads nothing and folds to zeroinitializer (see CHECK-NEXT).
; Exercises the ConstantInt-splat mask path added by this commit.
define <4 x float> @mload_fake_ones_splat(ptr %f) {
; CHECK-LABEL: @mload_fake_ones_splat(
; CHECK-NEXT:    ret <4 x float> zeroinitializer
;
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(ptr %f, <4 x i32> splat(i32 1))
  ret <4 x float> %ld
}

; All mask bits are set, so this is just a vector load.

define <4 x float> @mload_real_ones(ptr %f) {
@@ -59,6 +68,15 @@ define <4 x float> @mload_real_ones(ptr %f) {
  ret <4 x float> %ld
}

; Splat mask of i32 -1: sign bit set in every lane, so the masked load is
; equivalent to a plain (align-1) vector load — instcombine folds it to an
; ordinary `load <4 x float>` (pinned by the CHECK lines).
define <4 x float> @mload_real_ones_splat(ptr %f) {
; CHECK-LABEL: @mload_real_ones_splat(
; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <4 x float>, ptr [[F:%.*]], align 1
; CHECK-NEXT:    ret <4 x float> [[UNMASKEDLOAD]]
;
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(ptr %f, <4 x i32> splat(i32 -1))
  ret <4 x float> %ld
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.

define <4 x float> @mload_one_one(ptr %f) {