Commit 3f087e38 authored by Simon Pilgrim's avatar Simon Pilgrim
Browse files

[X86][SSE] combineX86ShufflesRecursively - at Depth==0, only resolve KnownZero...

[X86][SSE] combineX86ShufflesRecursively - at Depth==0, only resolve KnownZero if it removes an input.

This stops infinite loops where KnownUndef elements are converted to Zeroable, resulting in KnownZero elements which are then simplified (via SimplifyDemandedElts etc.) back to KnownUndef elements.

Prep fix for PR43024 which will allow rL368307 to be re-applied.
parent 3fbd1c00
Loading
Loading
Loading
Loading
+31 −6
Original line number Diff line number Diff line
@@ -6891,7 +6891,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero) {
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros= true) {
  unsigned NumElts = Mask.size();
  assert(KnownUndef.getBitWidth() == NumElts &&
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6899,7 +6900,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
  for (unsigned i = 0; i != NumElts; ++i) {
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (KnownZero[i])
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}
@@ -33071,17 +33072,36 @@ static SDValue combineX86ShufflesRecursively(
                              OpZero, DAG, Depth, false))
    return SDValue();
  resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
  SmallVector<int, 64> Mask;
  SmallVector<SDValue, 16> Ops;
  // We don't need to merge masks if the root is empty.
  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
  if (EmptyRoot) {
    // Only resolve zeros if it will remove an input, otherwise we might end
    // up in an infinite loop.
    bool ResolveKnownZeros = true;
    if (!OpZero.isNullValue()) {
      APInt UsedInputs = APInt::getNullValue(OpInputs.size());
      for (int i = 0, e = OpMask.size(); i != e; ++i) {
        int M = OpMask[i];
        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
          continue;
        UsedInputs.setBit(M / OpMask.size());
        if (UsedInputs.isAllOnesValue()) {
          ResolveKnownZeros = false;
          break;
        }
      }
    }
    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
                                      ResolveKnownZeros);
    Mask = OpMask;
    Ops.append(OpInputs.begin(), OpInputs.end());
  } else {
    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
    // Add the inputs to the Ops list, avoiding duplicates.
    Ops.append(SrcOps.begin(), SrcOps.end());
@@ -33216,13 +33236,18 @@ static SDValue combineX86ShufflesRecursively(
  // the remaining recursion depth.
  if (Ops.size() < (MaxRecursionDepth - Depth)) {
    for (int i = 0, e = Ops.size(); i < e; ++i) {
      // For empty roots, we need to resolve zeroable elements before combining
      // them with other shuffles.
      SmallVector<int, 64> ResolvedMask = Mask;
      if (EmptyRoot)
        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
      bool AllowVar = false;
      if (Ops[i].getNode()->hasOneUse() ||
          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
        AllowVar = AllowVariableMask;
      if (SDValue Res = combineX86ShufflesRecursively(
              Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
              AllowVar, DAG, Subtarget))
              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
              HasVariableMask, AllowVar, DAG, Subtarget))
        return Res;
    }
  }
+62 −0
Original line number Diff line number Diff line
@@ -2981,3 +2981,65 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}

; Regression test for PR43024.
; The body stores the constant vector <NaN, NaN, 0.0, 0.0> through an undef
; pointer, reloads it, multiplies it by an all-zero vector, and then sums
; lane 0 of the product with copies of the product whose lane 0 was taken from
; lanes 1 and 3 (all other shuffle lanes are undef). Only lane 0 of the final
; sum is extracted and stored.
; NOTE(review): presumably this mix of undef shuffle lanes and zero constants
; is what drove the X86 shuffle combiner to flip elements between "known undef"
; and "known zero" indefinitely - confirm against the PR43024 report.
define void @PR43024() {
; SSE2-LABEL: PR43024:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE2-NEXT:    movaps %xmm0, (%rax)
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movss %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR43024:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSSE3-NEXT:    movaps %xmm0, (%rax)
; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-NEXT:    addss %xmm0, %xmm1
; SSSE3-NEXT:    xorps %xmm0, %xmm0
; SSSE3-NEXT:    addss %xmm0, %xmm1
; SSSE3-NEXT:    addss %xmm0, %xmm1
; SSSE3-NEXT:    movss %xmm1, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR43024:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE41-NEXT:    movaps %xmm0, (%rax)
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movss %xmm1, (%rax)
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR43024:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT:    vmovaps %xmm0, (%rax)
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rax)
; AVX-NEXT:    retq
  ; Store then reload the <NaN, NaN, 0.0, 0.0> constant through an undef pointer.
  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
  %1 = load <4 x float>, <4 x float>* undef, align 16
  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
  ; Both shuffles define only lane 0 (from lane 1 and lane 3 of %2 respectively);
  ; the remaining lanes are undef, and only lane 0 of the final sum is used.
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %2, %3
  %5 = fadd <4 x float> zeroinitializer, %4
  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %7 = fadd <4 x float> %6, %5
  %8 = extractelement <4 x float> %7, i32 0
  store float %8, float* undef, align 8
  ret void
}