Commit 67659430 authored by Roman Lebedev
Browse files

[CodeGen][SelectionDAG] More efficient code for X % C == 0 (SREM case)

Summary:
This implements an optimization described in Hacker's Delight 10-17:
when `C` is constant, the result of `X % C == 0` can be computed
more cheaply without actually calculating the remainder.
The motivation is discussed here: https://bugs.llvm.org/show_bug.cgi?id=35479.

One huge caveat: this signed case is only valid for positive divisors.

While we can freely negate negative divisors, we can't negate `INT_MIN`,
so for now, if `INT_MIN` is encountered, we bail out.
As a follow-up, it should be possible to handle that more gracefully
via extra `and`+`setcc`+`select`.

This passes LLVM's test-suite, and from cursory(!) cross-examination
the folds (the assembly) match those of GCC, and manual checking via Alive
did not reveal any issues (other than the `INT_MIN` case).

Reviewers: RKSimon, spatel, hermord, craig.topper, xbolva00

Reviewed By: RKSimon, xbolva00

Subscribers: xbolva00, thakis, javed.absar, hiraditya, dexonsmith, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65366

llvm-svn: 368702
parent f4de7eda
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -4164,6 +4164,14 @@ private:
  SDValue buildUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
                          ISD::CondCode Cond, DAGCombinerInfo &DCI,
                          const SDLoc &DL) const;

  /// Helper for buildSREMEqFold: constructs the replacement comparison for a
  /// `(seteq/ne (srem N, D), 0)` pattern with constant D, appending each
  /// arithmetic node it creates to \p Created. It does not add anything to the
  /// combiner worklist itself. Returns an empty SDValue when no fold was
  /// produced.
  SDValue prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
                            SDValue CompTargetNode, ISD::CondCode Cond,
                            DAGCombinerInfo &DCI, const SDLoc &DL,
                            SmallVectorImpl<SDNode *> &Created) const;
  /// Given an ISD::SREM (\p REMNode) compared for (in)equality against
  /// \p CompTargetNode, try to produce an equivalent comparison that does not
  /// compute the remainder. On success the newly created nodes are added to
  /// the worklist of \p DCI and the new comparison is returned; otherwise an
  /// empty SDValue is returned.
  SDValue buildSREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
                          ISD::CondCode Cond, DAGCombinerInfo &DCI,
                          const SDLoc &DL) const;
};

/// Given an LLVM IR type and return type attributes, compute the return value
+221 −5
Original line number Diff line number Diff line
@@ -3802,15 +3802,21 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
  }

  // Fold remainder of division by a constant.
  if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
      (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
  if ((N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM) &&
      N0.hasOneUse() && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
    AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

    // When division is cheap or optimizing for minimum size,
    // fall through to DIVREM creation by skipping this fold.
    if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
    if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize)) {
      if (N0.getOpcode() == ISD::UREM) {
        if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
          return Folded;
      } else if (N0.getOpcode() == ISD::SREM) {
        if (SDValue Folded = buildSREMEqFold(VT, N0, N1, Cond, DCI, dl))
          return Folded;
      }
    }
  }

  // Fold away ALL boolean setcc's.
@@ -5004,6 +5010,216 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
                      ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
}

/// Entry point for the signed-remainder equality fold
/// ("Hacker's Delight" 10-17).
///
/// \p REMNode is an ISD::SREM whose only user is an ISD::SETEQ / ISD::SETNE
/// against \p CompTargetNode. When the divisor is constant and the comparison
/// target is zero, the comparison can be expressed with multiplications,
/// additions and shifts/rotations only. The actual construction is delegated
/// to prepareSREMEqFold(); this wrapper queues every node that was created on
/// the combiner worklist once the fold has succeeded.
///
/// \returns the replacement comparison, or an empty SDValue if the fold did
///          not apply.
SDValue TargetLowering::buildSREMEqFold(EVT SETCCVT, SDValue REMNode,
                                        SDValue CompTargetNode,
                                        ISD::CondCode Cond,
                                        DAGCombinerInfo &DCI,
                                        const SDLoc &DL) const {
  SmallVector<SDNode *, 3> NewNodes;
  SDValue Replacement = prepareSREMEqFold(SETCCVT, REMNode, CompTargetNode,
                                          Cond, DCI, DL, NewNodes);

  // Nothing was folded: report failure without touching the worklist.
  if (!Replacement)
    return SDValue();

  // Let the combiner revisit everything that was just built.
  for (SDNode *NewNode : NewNodes)
    DCI.AddToWorklist(NewNode);

  return Replacement;
}

/// Construct the replacement for `(seteq/ne (srem N, D), 0)` described in
/// "Hacker's Delight" 10-17, without touching the combiner worklist.
///
/// \param SETCCVT        Result type of the comparison that is built.
/// \param REMNode        The ISD::SREM node; operand 0 is N, operand 1 is D.
/// \param CompTargetNode Value the remainder is compared against; the fold
///                       only fires when it is a zero constant (or splat).
/// \param Cond           ISD::SETEQ or ISD::SETNE (asserted).
/// \param DCI            Supplies the SelectionDAG the nodes are built in.
/// \param DL             Debug location used for the new nodes.
/// \param Created        Receives the MUL/ADD/ROTR nodes that were built.
/// \returns the new setcc, or an empty SDValue (with \p Created untouched)
///          when the fold is not applicable.
SDValue
TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
                                  SDValue CompTargetNode, ISD::CondCode Cond,
                                  DAGCombinerInfo &DCI, const SDLoc &DL,
                                  SmallVectorImpl<SDNode *> &Created) const {
  // Fold:
  //   (seteq/ne (srem N, D), 0)
  // To:
  //   (setule/ugt (rotr (add (mul N, P), A), K), Q)
  //
  // - D must be constant, with D = D0 * 2^K where D0 is odd
  // - P is the multiplicative inverse of D0 modulo 2^W
  // - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^K)))
  // - Q = floor((2 * A) / (2^K))
  // where W is the width of the common type of N and D.
  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
         "Only applicable for (in)equality comparisons.");

  SelectionDAG &DAG = DCI.DAG;

  EVT VT = REMNode.getValueType();
  EVT SVT = VT.getScalarType();
  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
  EVT ShSVT = ShVT.getScalarType();

  // If MUL is unavailable, we cannot proceed in any case.
  if (!isOperationLegalOrCustom(ISD::MUL, VT))
    return SDValue();

  // TODO: Could support comparing with non-zero too.
  ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
  if (!CompTarget || !CompTarget->isNullValue())
    return SDValue();

  // Facts accumulated over all (per-lane) divisors by BuildSREMPattern.
  bool HadOneDivisor = false;
  bool AllDivisorsAreOnes = true;
  bool HadEvenDivisor = false;
  bool NeedToApplyOffset = false;
  bool AllDivisorsArePowerOfTwo = true;
  // Per-lane constants for the P, A, K and Q operands of the fold.
  SmallVector<SDValue, 16> PAmts, AAmts, KAmts, QAmts;

  auto BuildSREMPattern = [&](ConstantSDNode *C) {
    // Division by 0 is UB. Leave it to be constant-folded elsewhere.
    if (C->isNullValue())
      return false;

    // FIXME: we don't fold `rem %X, -C` to `rem %X, C` in DAGCombine.

    // WARNING: this fold is only valid for positive divisors!
    APInt D = C->getAPIntValue();
    if (D.isMinSignedValue())
      return false; // We can't negate INT_MIN.
    if (D.isNegative())
      D.negate(); //  `rem %X, -C` is equivalent to `rem %X, C`

    assert(!D.isNegative() && "The fold is only valid for positive divisors!");

    // If all divisors are ones, we will prefer to avoid the fold.
    HadOneDivisor |= D.isOneValue();
    AllDivisorsAreOnes &= D.isOneValue();

    // Decompose D into D0 * 2^K
    unsigned K = D.countTrailingZeros();
    assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
    APInt D0 = D.lshr(K);

    // D is even if it has trailing zeros.
    HadEvenDivisor |= (K != 0);
    // D is a power-of-two if D0 is one.
    // If all divisors are power-of-two, we will prefer to avoid the fold.
    AllDivisorsArePowerOfTwo &= D0.isOneValue();

    // P = inv(D0, 2^W)
    // 2^W requires W + 1 bits, so we have to extend and then truncate.
    unsigned W = D.getBitWidth();
    APInt P = D0.zext(W + 1)
                  .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
                  .trunc(W);
    assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
    assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");

    // A = floor((2^(W - 1) - 1) / D0) & -2^K
    APInt A = APInt::getSignedMaxValue(W).udiv(D0);
    A.clearLowBits(K);

    NeedToApplyOffset |= A != 0;

    // Q = floor((2 * A) / (2^K))
    APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K));

    assert(APInt::getAllOnesValue(SVT.getSizeInBits()).ugt(A) &&
           "We are expecting that A is always less than all-ones for SVT");
    assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
           "We are expecting that K is always less than all-ones for ShSVT");

    // The rotate amount, materialized at the width of the shift-amount type.
    APInt KAmt(ShSVT.getSizeInBits(), K);

    // If the divisor is 1 the result can be constant-folded.
    if (D.isOneValue()) {
      // Set P, A and K to bogus values so we can try to splat them.
      P = 0;
      A = -1;
      // Build the all-ones marker directly at ShSVT's width. Assigning -1 to
      // the `unsigned` K and zero-extending it would not produce all-ones
      // when ShSVT is wider than `unsigned`, and the isAllOnesConstant test
      // used when splatting KAmts would then fail to recognize this lane.
      KAmt = APInt::getAllOnesValue(ShSVT.getSizeInBits());

      // x ?% 1 == 0  <-->  true  <-->  x u<= -1
      Q = -1;
    }

    PAmts.push_back(DAG.getConstant(P, DL, SVT));
    AAmts.push_back(DAG.getConstant(A, DL, SVT));
    KAmts.push_back(DAG.getConstant(KAmt, DL, ShSVT));
    QAmts.push_back(DAG.getConstant(Q, DL, SVT));
    return true;
  };

  SDValue N = REMNode.getOperand(0);
  SDValue D = REMNode.getOperand(1);

  // Collect the values from each element.
  if (!ISD::matchUnaryPredicate(D, BuildSREMPattern))
    return SDValue();

  // If this is a srem by a one, avoid the fold since it can be constant-folded.
  if (AllDivisorsAreOnes)
    return SDValue();

  // If this is a srem by powers-of-two, avoid the fold since it can be
  // best implemented as a bit test.
  if (AllDivisorsArePowerOfTwo)
    return SDValue();

  // Check up front that every operation the fold is going to emit is
  // available, so that we do not bail out after the MUL/ADD nodes have
  // already been built and recorded in Created.
  // We need ADD to apply the offset A.
  if (NeedToApplyOffset && !isOperationLegalOrCustom(ISD::ADD, VT))
    return SDValue();
  // We need ROTR to handle even divisors.
  if (HadEvenDivisor && !isOperationLegalOrCustom(ISD::ROTR, VT))
    return SDValue();

  SDValue PVal, AVal, KVal, QVal;
  if (VT.isVector()) {
    if (HadOneDivisor) {
      // Try to turn PAmts into a splat, since we don't care about the values
      // that are currently '0'. If we can't, just keep '0'`s.
      turnVectorIntoSplatVector(PAmts, isNullConstant);
      // Try to turn AAmts into a splat, since we don't care about the
      // values that are currently '-1'. If we can't, change them to '0'`s.
      turnVectorIntoSplatVector(AAmts, isAllOnesConstant,
                                DAG.getConstant(0, DL, SVT));
      // Try to turn KAmts into a splat, since we don't care about the values
      // that are currently '-1'. If we can't, change them to '0'`s.
      turnVectorIntoSplatVector(KAmts, isAllOnesConstant,
                                DAG.getConstant(0, DL, ShSVT));
    }

    PVal = DAG.getBuildVector(VT, DL, PAmts);
    AVal = DAG.getBuildVector(VT, DL, AAmts);
    KVal = DAG.getBuildVector(ShVT, DL, KAmts);
    QVal = DAG.getBuildVector(VT, DL, QAmts);
  } else {
    // Scalar case: the predicate ran exactly once, so use the single entry.
    PVal = PAmts[0];
    AVal = AAmts[0];
    KVal = KAmts[0];
    QVal = QAmts[0];
  }

  // (mul N, P)
  SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
  Created.push_back(Op0.getNode());

  if (NeedToApplyOffset) {
    // (add (mul N, P), A)
    Op0 = DAG.getNode(ISD::ADD, DL, VT, Op0, AVal);
    Created.push_back(Op0.getNode());
  }

  // Rotate right only if any divisor was even. We avoid rotates for all-odd
  // divisors as a performance improvement, since rotating by 0 is a no-op.
  if (HadEvenDivisor) {
    SDNodeFlags Flags;
    Flags.setExact(true);
    // SREM: (rotr (add (mul N, P), A), K)
    Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags);
    Created.push_back(Op0.getNode());
  }

  // SREM: (setule/setugt (rotr (add (mul N, P), A), K), Q)
  return DAG.getSetCC(DL, SETCCVT, Op0, QVal,
                      ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
}

bool TargetLowering::
verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
  if (!isa<ConstantSDNode>(Op.getOperand(0))) {
+9 −10
Original line number Diff line number Diff line
@@ -21,17 +21,16 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
define i32 @test_optsize(i32 %X) optsize nounwind readnone {
; CHECK-LABEL: test_optsize:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #26215
; CHECK-NEXT:    movk w8, #26214, lsl #16
; CHECK-NEXT:    smull x8, w0, w8
; CHECK-NEXT:    lsr x10, x8, #63
; CHECK-NEXT:    asr x8, x8, #33
; CHECK-NEXT:    add w8, w8, w10
; CHECK-NEXT:    add w8, w8, w8, lsl #2
; CHECK-NEXT:    mov w9, #-10
; CHECK-NEXT:    cmp w0, w8
; CHECK-NEXT:    mov w8, #52429
; CHECK-NEXT:    mov w9, #39321
; CHECK-NEXT:    movk w8, #52428, lsl #16
; CHECK-NEXT:    movk w9, #6553, lsl #16
; CHECK-NEXT:    mov w10, #858993459
; CHECK-NEXT:    madd w8, w0, w8, w9
; CHECK-NEXT:    mov w11, #-10
; CHECK-NEXT:    cmp w8, w10
; CHECK-NEXT:    mov w8, #42
; CHECK-NEXT:    csel w0, w8, w9, eq
; CHECK-NEXT:    csel w0, w8, w11, lo
; CHECK-NEXT:    ret
  %rem = srem i32 %X, 5
  %cmp = icmp eq i32 %rem, 0
+40 −85
Original line number Diff line number Diff line
@@ -37,27 +37,16 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_eq:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, .LCPI1_0
; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT:    adrp x8, .LCPI1_1
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_1]
; CHECK-NEXT:    adrp x8, .LCPI1_2
; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI1_2]
; CHECK-NEXT:    adrp x8, .LCPI1_3
; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI1_3]
; CHECK-NEXT:    adrp x8, .LCPI1_4
; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_4]
; CHECK-NEXT:    neg v3.4s, v3.4s
; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    adrp x10, .LCPI1_0
; CHECK-NEXT:    mov w8, #52429
; CHECK-NEXT:    mov w9, #39321
; CHECK-NEXT:    ldr q1, [x10, :lo12:.LCPI1_0]
; CHECK-NEXT:    movk w8, #52428, lsl #16
; CHECK-NEXT:    movk w9, #6553, lsl #16
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    dup v3.4s, w9
; CHECK-NEXT:    mla v3.4s, v0.4s, v2.4s
; CHECK-NEXT:    cmhs v0.4s, v1.4s, v3.4s
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
@@ -69,28 +58,16 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_ne:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, .LCPI2_0
; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT:    adrp x8, .LCPI2_1
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_1]
; CHECK-NEXT:    adrp x8, .LCPI2_2
; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI2_2]
; CHECK-NEXT:    adrp x8, .LCPI2_3
; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI2_3]
; CHECK-NEXT:    adrp x8, .LCPI2_4
; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_4]
; CHECK-NEXT:    neg v3.4s, v3.4s
; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    mvn v0.16b, v0.16b
; CHECK-NEXT:    adrp x10, .LCPI2_0
; CHECK-NEXT:    mov w8, #52429
; CHECK-NEXT:    mov w9, #39321
; CHECK-NEXT:    ldr q1, [x10, :lo12:.LCPI2_0]
; CHECK-NEXT:    movk w8, #52428, lsl #16
; CHECK-NEXT:    movk w9, #6553, lsl #16
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    dup v3.4s, w9
; CHECK-NEXT:    mla v3.4s, v0.4s, v2.4s
; CHECK-NEXT:    cmhi v0.4s, v3.4s, v1.4s
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
@@ -327,27 +304,16 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, .LCPI10_0
; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT:    adrp x8, .LCPI10_1
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI10_1]
; CHECK-NEXT:    adrp x8, .LCPI10_2
; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI10_2]
; CHECK-NEXT:    adrp x8, .LCPI10_3
; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI10_3]
; CHECK-NEXT:    adrp x8, .LCPI10_4
; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI10_4]
; CHECK-NEXT:    neg v3.4s, v3.4s
; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    adrp x10, .LCPI10_0
; CHECK-NEXT:    mov w8, #52429
; CHECK-NEXT:    mov w9, #39321
; CHECK-NEXT:    ldr q1, [x10, :lo12:.LCPI10_0]
; CHECK-NEXT:    movk w8, #52428, lsl #16
; CHECK-NEXT:    movk w9, #6553, lsl #16
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    dup v3.4s, w9
; CHECK-NEXT:    mla v3.4s, v0.4s, v2.4s
; CHECK-NEXT:    cmhs v0.4s, v1.4s, v3.4s
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
@@ -625,27 +591,16 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_and_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, .LCPI19_0
; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
; CHECK-NEXT:    adrp x8, .LCPI19_1
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI19_1]
; CHECK-NEXT:    adrp x8, .LCPI19_2
; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI19_2]
; CHECK-NEXT:    adrp x8, .LCPI19_3
; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI19_3]
; CHECK-NEXT:    adrp x8, .LCPI19_4
; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI19_4]
; CHECK-NEXT:    neg v3.4s, v3.4s
; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    adrp x10, .LCPI19_0
; CHECK-NEXT:    mov w8, #52429
; CHECK-NEXT:    mov w9, #39321
; CHECK-NEXT:    ldr q1, [x10, :lo12:.LCPI19_0]
; CHECK-NEXT:    movk w8, #52428, lsl #16
; CHECK-NEXT:    movk w9, #6553, lsl #16
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    dup v3.4s, w9
; CHECK-NEXT:    mla v3.4s, v0.4s, v2.4s
; CHECK-NEXT:    cmhs v0.4s, v1.4s, v3.4s
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
+22 −22
Original line number Diff line number Diff line
@@ -5,17 +5,17 @@
define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_25:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #34079
; CHECK-NEXT:    movk w8, #20971, lsl #16
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    smull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT:    smull v2.2d, v0.2s, v2.2s
; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT:    sshr v3.4s, v2.4s, #3
; CHECK-NEXT:    movi v1.4s, #25
; CHECK-NEXT:    usra v3.4s, v2.4s, #31
; CHECK-NEXT:    mls v0.4s, v3.4s, v1.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    mov w8, #23593
; CHECK-NEXT:    mov w9, #47185
; CHECK-NEXT:    movk w8, #49807, lsl #16
; CHECK-NEXT:    movk w9, #1310, lsl #16
; CHECK-NEXT:    mov w10, #28834
; CHECK-NEXT:    movk w10, #2621, lsl #16
; CHECK-NEXT:    dup v1.4s, w8
; CHECK-NEXT:    dup v2.4s, w9
; CHECK-NEXT:    dup v3.4s, w10
; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmhs v0.4s, v3.4s, v2.4s
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
@@ -55,17 +55,17 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_neg25:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, .LCPI2_0
; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT:    adrp x8, .LCPI2_1
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_1]
; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT:    sshr v3.4s, v1.4s, #3
; CHECK-NEXT:    usra v3.4s, v1.4s, #31
; CHECK-NEXT:    mls v0.4s, v3.4s, v2.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    mov w8, #23593
; CHECK-NEXT:    mov w9, #47185
; CHECK-NEXT:    movk w8, #49807, lsl #16
; CHECK-NEXT:    movk w9, #1310, lsl #16
; CHECK-NEXT:    mov w10, #28834
; CHECK-NEXT:    movk w10, #2621, lsl #16
; CHECK-NEXT:    dup v1.4s, w8
; CHECK-NEXT:    dup v2.4s, w9
; CHECK-NEXT:    dup v3.4s, w10
; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmhs v0.4s, v3.4s, v2.4s
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
Loading