Unverified Commit c9140eb1 authored by Paul Walker's avatar Paul Walker Committed by GitHub
Browse files

[LLVM][X86InstCombine] Extend mask combines to cover ConstantInt/FP based splats. (#195090)

parent 6e2cb043
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -40,8 +40,8 @@ static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);
  if (isa<ConstantInt, ConstantFP, ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(cast<Constant>(Mask), DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
@@ -2973,9 +2973,9 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
    if (isa<ConstantInt, ConstantFP, ConstantDataVector>(Mask)) {
      Constant *NewSelector =
          getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
          getNegativeIsTrueBoolVec(cast<Constant>(Mask), IC.getDataLayout());
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }
    unsigned BitWidth = Mask->getType()->getScalarSizeInBits();
+50 −1
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat=false -use-constant-fp-for-fixed-length-splat=false -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat -S | FileCheck %s

define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd(
@@ -18,6 +19,14 @@ define <2 x double> @constant_blendvpd_zero(<2 x double> %xy, <2 x double> %ab)
  ret <2 x double> %1
}

; Mask is a splat of -0.0 (sign bit set in every lane); blendvpd selects the
; second source per the mask sign bit, so instcombine folds this to `ret %ab`
; (pinned by the CHECK-NEXT line below).
define <2 x double> @constant_blendvpd_nzero(<2 x double> %xy, <2 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd_nzero(
; CHECK-NEXT:    ret <2 x double> [[AB:%.*]]
;
  %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> splat (double -0.000000e+00))
  ret <2 x double> %1
}

define <2 x double> @constant_blendvpd_dup(<2 x double> %xy, <2 x double> %sel) {
; CHECK-LABEL: @constant_blendvpd_dup(
; CHECK-NEXT:    ret <2 x double> [[XY:%.*]]
@@ -43,6 +52,14 @@ define <4 x float> @constant_blendvps_zero(<4 x float> %xyzw, <4 x float> %abcd)
  ret <4 x float> %1
}

; Same as the blendvpd case but for <4 x float>: a -0.0 splat mask has every
; sign bit set, so the blend folds to the second operand (see CHECK-NEXT).
define <4 x float> @constant_blendvps_nzero(<4 x float> %xyzw, <4 x float> %abcd) {
; CHECK-LABEL: @constant_blendvps_nzero(
; CHECK-NEXT:    ret <4 x float> [[ABCD:%.*]]
;
  %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> splat (float -0.000000e+00))
  ret <4 x float> %1
}

define <4 x float> @constant_blendvps_dup(<4 x float> %xyzw, <4 x float> %sel) {
; CHECK-LABEL: @constant_blendvps_dup(
; CHECK-NEXT:    ret <4 x float> [[XYZW:%.*]]
@@ -68,6 +85,14 @@ define <16 x i8> @constant_pblendvb_zero(<16 x i8> %xyzw, <16 x i8> %abcd) {
  ret <16 x i8> %1
}

; Integer splat mask: i8 -1 (0xFF) has the sign bit set in every byte lane,
; so pblendvb folds to the second operand (pinned by CHECK-NEXT). Exercises
; the ConstantInt-splat path added by this commit.
define <16 x i8> @constant_pblendvb_all_ones(<16 x i8> %xyzw, <16 x i8> %abcd) {
; CHECK-LABEL: @constant_pblendvb_all_ones(
; CHECK-NEXT:    ret <16 x i8> [[ABCD:%.*]]
;
  %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> splat (i8 -1))
  ret <16 x i8> %1
}

define <16 x i8> @constant_pblendvb_dup(<16 x i8> %xyzw, <16 x i8> %sel) {
; CHECK-LABEL: @constant_pblendvb_dup(
; CHECK-NEXT:    ret <16 x i8> [[XYZW:%.*]]
@@ -93,6 +118,14 @@ define <4 x double> @constant_blendvpd_avx_zero(<4 x double> %xy, <4 x double> %
  ret <4 x double> %1
}

; 256-bit AVX variant: -0.0 splat mask (all sign bits set) folds the blend
; to the second operand (see CHECK-NEXT).
define <4 x double> @constant_blendvpd_avx_nzero(<4 x double> %xy, <4 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd_avx_nzero(
; CHECK-NEXT:    ret <4 x double> [[AB:%.*]]
;
  %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> splat (double -0.000000e+00))
  ret <4 x double> %1
}

define <4 x double> @constant_blendvpd_avx_dup(<4 x double> %xy, <4 x double> %sel) {
; CHECK-LABEL: @constant_blendvpd_avx_dup(
; CHECK-NEXT:    ret <4 x double> [[XY:%.*]]
@@ -118,6 +151,14 @@ define <8 x float> @constant_blendvps_avx_zero(<8 x float> %xyzw, <8 x float> %a
  ret <8 x float> %1
}

; 256-bit AVX float variant: -0.0 splat mask (all sign bits set) folds the
; blend to the second operand (see CHECK-NEXT).
define <8 x float> @constant_blendvps_avx_nzero(<8 x float> %xyzw, <8 x float> %abcd) {
; CHECK-LABEL: @constant_blendvps_avx_nzero(
; CHECK-NEXT:    ret <8 x float> [[ABCD:%.*]]
;
  %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> splat (float -0.000000e+00))
  ret <8 x float> %1
}

define <8 x float> @constant_blendvps_avx_dup(<8 x float> %xyzw, <8 x float> %sel) {
; CHECK-LABEL: @constant_blendvps_avx_dup(
; CHECK-NEXT:    ret <8 x float> [[XYZW:%.*]]
@@ -147,6 +188,14 @@ define <32 x i8> @constant_pblendvb_avx2_zero(<32 x i8> %xyzw, <32 x i8> %abcd)
  ret <32 x i8> %1
}

; AVX2 256-bit byte blend: i8 -1 splat mask has every lane's sign bit set,
; so the blend folds to the second operand (pinned by CHECK-NEXT).
define <32 x i8> @constant_pblendvb_avx2_all_ones(<32 x i8> %xyzw, <32 x i8> %abcd) {
; CHECK-LABEL: @constant_pblendvb_avx2_all_ones(
; CHECK-NEXT:    ret <32 x i8> [[XYZW:%.*]]

define <32 x i8> @constant_pblendvb_avx2_dup(<32 x i8> %xyzw, <32 x i8> %sel) {
; CHECK-LABEL: @constant_pblendvb_avx2_dup(
; CHECK-NEXT:    ret <32 x i8> [[XYZW:%.*]]
+19 −1
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -use-constant-int-for-fixed-length-splat=false -S | FileCheck %s
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -use-constant-int-for-fixed-length-splat -S | FileCheck %s

;; MASKED LOADS

@@ -48,6 +49,14 @@ define <4 x float> @mload_fake_ones(ptr %f) {
  ret <4 x float> %ld
}

; Splat mask of i32 1: low bit set but sign bit clear in every lane, so the
; AVX maskload loads nothing and folds to zeroinitializer (see CHECK-NEXT).
; Exercises the ConstantInt-splat mask path added by this commit.
define <4 x float> @mload_fake_ones_splat(ptr %f) {
; CHECK-LABEL: @mload_fake_ones_splat(
; CHECK-NEXT:    ret <4 x float> zeroinitializer
;
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(ptr %f, <4 x i32> splat(i32 1))
  ret <4 x float> %ld
}

; All mask bits are set, so this is just a vector load.

define <4 x float> @mload_real_ones(ptr %f) {
@@ -59,6 +68,15 @@ define <4 x float> @mload_real_ones(ptr %f) {
  ret <4 x float> %ld
}

; Splat mask of i32 -1: sign bit set in every lane, so the masked load is
; equivalent to a plain (align-1) vector load — instcombine folds it to an
; ordinary `load <4 x float>` (pinned by the CHECK lines).
define <4 x float> @mload_real_ones_splat(ptr %f) {
; CHECK-LABEL: @mload_real_ones_splat(
; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <4 x float>, ptr [[F:%.*]], align 1
; CHECK-NEXT:    ret <4 x float> [[UNMASKEDLOAD]]
;
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(ptr %f, <4 x i32> splat(i32 -1))
  ret <4 x float> %ld
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.

define <4 x float> @mload_one_one(ptr %f) {