Unverified Commit 696406fb authored by Marius Kamp's avatar Marius Kamp Committed by GitHub
Browse files

[SelectionDAG] Use Magic Algorithm for Splitting UDIV/UREM by Constant (#154968)

For integer types twice as large as a legal type, we have previously
generated a library call if another splitting technique was not
applicable.
    
With this change, we use an adaptation of the Magic algorithm. This
algorithm is also used for UDIV/UREM by constants on legal types. The
implementation introduced here is a simple port of the already existing
implementation to types twice the size of a legal type. The core idea of
this algorithm is to replace (udiv x c) for a constant c with the bits at
or above the s-th bit of the product of x and (2^s + o)/c, for suitable
values of s and o. More details are available in Henry S. Warren, Jr.:
"Hacker's Delight", chapter 10.
    
An efficient handling of UDIV/UREM by constants on types twice as large
as a legal type is mostly relevant for 32-bit platforms. But some
projects may also benefit on 64-bit platforms. For example, the `fmt`
library for C++ uses 128-bit unsigned divisions by 100 and 10000, which
have not been covered by the previously existing optimizations.
    
Closes #137514.
parent 66401c2f
Loading
Loading
Loading
Loading
+15 −4
Original line number Diff line number Diff line
@@ -5538,10 +5538,12 @@ public:
                 SDValue LL = SDValue(), SDValue LH = SDValue(),
                 SDValue RL = SDValue(), SDValue RH = SDValue()) const;

  /// Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit
  /// urem by constant and other arithmetic ops. The n/2-bit urem by constant
  /// will be expanded by DAGCombiner. This is not possible for all constant
  /// divisors.
  /// Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit
  /// algorithm. First, attempt to expand the division using a n/2-bit urem by
  /// constant and other arithmetic ops. The n/2-bit urem by constant will be
  /// expanded by DAGCombiner. As this is not possible for all constant
  /// divisors, this method falls back to an implementation of the magic
  /// algorithm using n/2-bit operations.
  /// \param N Node to expand
  /// \param Result A vector that will be filled with the lo and high parts of
  ///        the results. For *DIVREM, this will be the quotient parts followed
@@ -6039,6 +6041,15 @@ private:
  SDValue buildSREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
                          ISD::CondCode Cond, DAGCombinerInfo &DCI,
                          const SDLoc &DL) const;

  bool expandUDIVREMByConstantViaUREMDecomposition(
      SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
      SelectionDAG &DAG, SDValue LL, SDValue LH) const;

  bool expandUDIVREMByConstantViaUMulHiMagic(SDNode *N, const APInt &Divisor,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
                                             SDValue LL, SDValue LH) const;
};

/// Given an LLVM IR type and return type attributes, compute the return value
+155 −30
Original line number Diff line number Diff line
@@ -8106,44 +8106,17 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// dividend and multiply by the multiplicative inverse of the shifted divisor.
// If we want the remainder, we shift the value left by the number of trailing
// zeros and add the bits that were shifted out of the dividend.
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                            SmallVectorImpl<SDValue> &Result,
                                            EVT HiLoVT, SelectionDAG &DAG,
                                            SDValue LL, SDValue LH) const {
bool TargetLowering::expandUDIVREMByConstantViaUREMDecomposition(
    SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
    SelectionDAG &DAG, SDValue LL, SDValue LH) const {
  unsigned Opcode = N->getOpcode();
  EVT VT = N->getValueType(0);
  // TODO: Support signed division/remainder.
  if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
    return false;
  assert(
      (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
      "Unexpected opcode");
  auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!CN)
    return false;
  APInt Divisor = CN->getAPIntValue();
  unsigned BitWidth = Divisor.getBitWidth();
  unsigned HBitWidth = BitWidth / 2;
  assert(VT.getScalarSizeInBits() == BitWidth &&
         HiLoVT.getScalarSizeInBits() == HBitWidth && "Unexpected VTs");
  // We depend on the UREM by constant optimization in DAGCombiner that requires
  // high multiply.
  if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
      !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
    return false;
  // Don't expand if optimizing for size.
  if (DAG.shouldOptForSize())
    return false;
  // Early out for 0 or 1 divisors.
  if (Divisor.ule(1))
    return false;
  // If the divisor is even, shift it until it becomes odd.
  unsigned TrailingZeros = 0;
  if (!Divisor[0]) {
@@ -8398,6 +8371,158 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
  return true;
}
bool TargetLowering::expandUDIVREMByConstantViaUMulHiMagic(
    SDNode *N, const APInt &Divisor, SmallVectorImpl<SDValue> &Result,
    EVT HiLoVT, SelectionDAG &DAG, SDValue LL, SDValue LH) const {
  SDValue N0 = N->getOperand(0);
  EVT VT = N0->getValueType(0);
  SDLoc DL{N};
  assert(!Divisor.isOne() && "Magic algorithm does not work for division by 1");
  // This helper creates a MUL_LOHI of the pair (LL, LH) by a constant.
  auto MakeMUL_LOHIByConst = [&](unsigned Opc, SDValue LL, SDValue LH,
                                 const APInt &Const,
                                 SmallVectorImpl<SDValue> &Result) {
    SDValue LHS = DAG.getNode(ISD::BUILD_PAIR, DL, VT, LL, LH);
    SDValue RHS = DAG.getConstant(Const, DL, VT);
    auto [RL, RH] = DAG.SplitScalar(RHS, DL, HiLoVT, HiLoVT);
    return expandMUL_LOHI(Opc, VT, DL, LHS, RHS, Result, HiLoVT, DAG,
                          TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
                          LL, LH, RL, RH);
  };
  // This helper creates an ADD/SUB of the pairs (LL, LH) and (RL, RH).
  auto MakeAddSubLong = [&](unsigned Opc, SDValue LL, SDValue LH, SDValue RL,
                            SDValue RH) {
    SDValue AddSubNode =
        DAG.getNode(Opc == ISD::ADD ? ISD::UADDO : ISD::USUBO, DL,
                    DAG.getVTList(HiLoVT, MVT::i1), LL, RL);
    SDValue OutL = AddSubNode.getValue(0);
    SDValue Overflow = AddSubNode.getValue(1);
    SDValue AddSubWithOverflow =
        DAG.getNode(Opc == ISD::ADD ? ISD::UADDO_CARRY : ISD::USUBO_CARRY, DL,
                    DAG.getVTList(HiLoVT, MVT::i1), LH, RH, Overflow);
    SDValue OutH = AddSubWithOverflow.getValue(0);
    return std::make_pair(OutL, OutH);
  };
  // This helper creates a SRL of the pair (LL, LH) by Shift.
  auto MakeSRLLong = [&](SDValue LL, SDValue LH, unsigned Shift) {
    unsigned HBitWidth = HiLoVT.getScalarSizeInBits();
    if (Shift < HBitWidth) {
      SDValue ShAmt = DAG.getShiftAmountConstant(Shift, HiLoVT, DL);
      SDValue ResL = DAG.getNode(ISD::FSHR, DL, HiLoVT, LH, LL, ShAmt);
      SDValue ResH = DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt);
      return std::make_pair(ResL, ResH);
    }
    SDValue Zero = DAG.getConstant(0, DL, HiLoVT);
    if (Shift == HBitWidth)
      return std::make_pair(LH, Zero);
    assert(Shift - HBitWidth < HBitWidth &&
           "We shouldn't generate an undefined shift");
    SDValue ShAmt = DAG.getShiftAmountConstant(Shift - HBitWidth, HiLoVT, DL);
    return std::make_pair(DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt), Zero);
  };
  // Knowledge of leading zeros may help to reduce the multiplier.
  unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
  UnsignedDivisionByConstantInfo Magics = UnsignedDivisionByConstantInfo::get(
      Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
  assert(!LL == !LH && "Expected both input halves or no input halves!");
  if (!LL)
    std::tie(LL, LH) = DAG.SplitScalar(N0, DL, HiLoVT, HiLoVT);
  SDValue QL = LL;
  SDValue QH = LH;
  if (Magics.PreShift != 0)
    std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PreShift);
  SmallVector<SDValue, 4> UMulResult;
  if (!MakeMUL_LOHIByConst(ISD::UMUL_LOHI, QL, QH, Magics.Magic, UMulResult))
    return false;
  QL = UMulResult[2];
  QH = UMulResult[3];
  if (Magics.IsAdd) {
    auto [NPQL, NPQH] = MakeAddSubLong(ISD::SUB, LL, LH, QL, QH);
    std::tie(NPQL, NPQH) = MakeSRLLong(NPQL, NPQH, 1);
    std::tie(QL, QH) = MakeAddSubLong(ISD::ADD, NPQL, NPQH, QL, QH);
  }
  if (Magics.PostShift != 0)
    std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PostShift);
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::UREM) {
    Result.push_back(QL);
    Result.push_back(QH);
  }
  if (Opcode != ISD::UDIV) {
    SmallVector<SDValue, 2> MulResult;
    if (!MakeMUL_LOHIByConst(ISD::MUL, QL, QH, Divisor, MulResult))
      return false;
    assert(MulResult.size() == 2);
    auto [RemL, RemH] =
        MakeAddSubLong(ISD::SUB, LL, LH, MulResult[0], MulResult[1]);
    Result.push_back(RemL);
    Result.push_back(RemH);
  }
  return true;
}
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                            SmallVectorImpl<SDValue> &Result,
                                            EVT HiLoVT, SelectionDAG &DAG,
                                            SDValue LL, SDValue LH) const {
  unsigned Opcode = N->getOpcode();
  // TODO: Support signed division/remainder.
  if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
    return false;
  assert(
      (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
      "Unexpected opcode");
  auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!CN)
    return false;
  APInt Divisor = CN->getAPIntValue();
  // We depend on the UREM by constant optimization in DAGCombiner that requires
  // high multiply.
  if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
      !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
    return false;
  // Don't expand if optimizing for size.
  if (DAG.shouldOptForSize())
    return false;
  // Early out for 0 or 1 divisors.
  if (Divisor.ule(1))
    return false;
  if (expandUDIVREMByConstantViaUREMDecomposition(N, Divisor, Result, HiLoVT,
                                                  DAG, LL, LH))
    return true;
  if (expandUDIVREMByConstantViaUMulHiMagic(N, Divisor, Result, HiLoVT, DAG, LL,
                                            LH))
    return true;
  return false;
}
// Check that (every element of) Z is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
  return ISD::matchUnaryPredicate(
+134 −71
Original line number Diff line number Diff line
@@ -778,9 +778,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
  ret <4 x i16> %1
}

; Don't fold i64 urem.
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32I-LABEL: dont_fold_urem_i64:
define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind {
; RV32I-LABEL: fold_urem_i64:
; RV32I:       # %bb.0:
; RV32I-NEXT:    addi sp, sp, -48
; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
@@ -850,83 +849,147 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32I-NEXT:    addi sp, sp, 48
; RV32I-NEXT:    ret
;
; RV32IM-LABEL: dont_fold_urem_i64:
; RV32IM-LABEL: fold_urem_i64:
; RV32IM:       # %bb.0:
; RV32IM-NEXT:    addi sp, sp, -48
; RV32IM-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s5, 20(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    addi sp, sp, -32
; RV32IM-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
; RV32IM-NEXT:    mv a2, a1
; RV32IM-NEXT:    mv s0, a0
; RV32IM-NEXT:    lw a2, 16(a1)
; RV32IM-NEXT:    lw a4, 20(a1)
; RV32IM-NEXT:    lw s1, 24(a1)
; RV32IM-NEXT:    lw s2, 28(a1)
; RV32IM-NEXT:    lw a7, 16(a1)
; RV32IM-NEXT:    lw a6, 20(a1)
; RV32IM-NEXT:    lw a3, 24(a1)
; RV32IM-NEXT:    lw a5, 28(a1)
; RV32IM-NEXT:    lw a0, 0(a1)
; RV32IM-NEXT:    lw a3, 4(a1)
; RV32IM-NEXT:    lw s3, 8(a1)
; RV32IM-NEXT:    lw s4, 12(a1)
; RV32IM-NEXT:    lui a1, 1024
; RV32IM-NEXT:    slli a5, a4, 10
; RV32IM-NEXT:    srli a6, a2, 22
; RV32IM-NEXT:    or a5, a6, a5
; RV32IM-NEXT:    lui a6, 45590
; RV32IM-NEXT:    addi a1, a1, -1
; RV32IM-NEXT:    addi a6, a6, 1069
; RV32IM-NEXT:    and a2, a2, a1
; RV32IM-NEXT:    srli a4, a4, 12
; RV32IM-NEXT:    add a2, a2, a4
; RV32IM-NEXT:    and a1, a5, a1
; RV32IM-NEXT:    add a1, a2, a1
; RV32IM-NEXT:    mulhu a2, a1, a6
; RV32IM-NEXT:    li a4, 23
; RV32IM-NEXT:    mul a2, a2, a4
; RV32IM-NEXT:    sub s7, a1, a2
; RV32IM-NEXT:    lw a1, 4(a1)
; RV32IM-NEXT:    lw a4, 8(a2)
; RV32IM-NEXT:    lw a2, 12(a2)
; RV32IM-NEXT:    lui t0, 410452
; RV32IM-NEXT:    lui t1, 25653
; RV32IM-NEXT:    lui t2, 791991
; RV32IM-NEXT:    lui t3, 834723
; RV32IM-NEXT:    lui t4, 1024
; RV32IM-NEXT:    addi t0, t0, -952
; RV32IM-NEXT:    addi t1, t1, 965
; RV32IM-NEXT:    addi t2, t2, 77
; RV32IM-NEXT:    addi t3, t3, -179
; RV32IM-NEXT:    addi t4, t4, -1
; RV32IM-NEXT:    srli t5, a4, 1
; RV32IM-NEXT:    slli t6, a2, 31
; RV32IM-NEXT:    srli s1, a2, 1
; RV32IM-NEXT:    mul s2, a3, t2
; RV32IM-NEXT:    and s3, a7, t4
; RV32IM-NEXT:    slli s4, a6, 10
; RV32IM-NEXT:    srli a7, a7, 22
; RV32IM-NEXT:    srli a6, a6, 12
; RV32IM-NEXT:    or t5, t6, t5
; RV32IM-NEXT:    mul t6, s1, t1
; RV32IM-NEXT:    mulhu s5, s1, t1
; RV32IM-NEXT:    or a7, a7, s4
; RV32IM-NEXT:    mul s4, s1, t0
; RV32IM-NEXT:    mulhu s1, s1, t0
; RV32IM-NEXT:    add a6, s3, a6
; RV32IM-NEXT:    mul s3, t5, t0
; RV32IM-NEXT:    mulhu t1, t5, t1
; RV32IM-NEXT:    mulhu t0, t5, t0
; RV32IM-NEXT:    mulhu t5, a3, t3
; RV32IM-NEXT:    and a7, a7, t4
; RV32IM-NEXT:    mul t4, a5, t3
; RV32IM-NEXT:    add s2, t5, s2
; RV32IM-NEXT:    add t4, s2, t4
; RV32IM-NEXT:    sltu t5, s2, t5
; RV32IM-NEXT:    sltu t4, t4, s2
; RV32IM-NEXT:    mulhu s2, a3, t2
; RV32IM-NEXT:    add t5, s2, t5
; RV32IM-NEXT:    add a6, a6, a7
; RV32IM-NEXT:    add s3, t1, s3
; RV32IM-NEXT:    add t6, s3, t6
; RV32IM-NEXT:    sltu a7, s3, t1
; RV32IM-NEXT:    sltu t1, t6, s3
; RV32IM-NEXT:    lui t6, 45590
; RV32IM-NEXT:    add a7, t0, a7
; RV32IM-NEXT:    li t0, 23
; RV32IM-NEXT:    addi t6, t6, 1069
; RV32IM-NEXT:    mulhu t3, a5, t3
; RV32IM-NEXT:    add t3, t5, t3
; RV32IM-NEXT:    mulhu t6, a6, t6
; RV32IM-NEXT:    sltu t5, t3, t5
; RV32IM-NEXT:    add t3, t3, t4
; RV32IM-NEXT:    mul t0, t6, t0
; RV32IM-NEXT:    seqz t6, t3
; RV32IM-NEXT:    and t4, t6, t4
; RV32IM-NEXT:    or t4, t5, t4
; RV32IM-NEXT:    mul t5, a5, t2
; RV32IM-NEXT:    mulhu t2, a5, t2
; RV32IM-NEXT:    add s5, a7, s5
; RV32IM-NEXT:    add t5, t3, t5
; RV32IM-NEXT:    sltu a7, s5, a7
; RV32IM-NEXT:    add s5, s5, t1
; RV32IM-NEXT:    sltu t3, t5, t3
; RV32IM-NEXT:    add t2, t3, t2
; RV32IM-NEXT:    seqz t3, s5
; RV32IM-NEXT:    and t1, t3, t1
; RV32IM-NEXT:    add t2, t2, t4
; RV32IM-NEXT:    or a7, a7, t1
; RV32IM-NEXT:    li t1, 654
; RV32IM-NEXT:    add s4, s5, s4
; RV32IM-NEXT:    sltu t3, s4, s5
; RV32IM-NEXT:    add t3, t3, s1
; RV32IM-NEXT:    lui t4, 1
; RV32IM-NEXT:    addi t4, t4, 1327
; RV32IM-NEXT:    srli t5, t5, 12
; RV32IM-NEXT:    srli t6, s4, 7
; RV32IM-NEXT:    add a7, t3, a7
; RV32IM-NEXT:    srli t3, t2, 12
; RV32IM-NEXT:    slli t2, t2, 20
; RV32IM-NEXT:    mul t3, t3, t4
; RV32IM-NEXT:    or t2, t2, t5
; RV32IM-NEXT:    srli t5, a7, 7
; RV32IM-NEXT:    slli a7, a7, 25
; RV32IM-NEXT:    sub a5, a5, t3
; RV32IM-NEXT:    mulhu t3, t2, t4
; RV32IM-NEXT:    mul t2, t2, t4
; RV32IM-NEXT:    mul t4, t5, t1
; RV32IM-NEXT:    or a7, a7, t6
; RV32IM-NEXT:    sub a5, a5, t3
; RV32IM-NEXT:    sub s1, a3, t2
; RV32IM-NEXT:    mulhu t2, a7, t1
; RV32IM-NEXT:    sub a2, a2, t4
; RV32IM-NEXT:    mul a7, a7, t1
; RV32IM-NEXT:    sltu a3, a3, s1
; RV32IM-NEXT:    sub a2, a2, t2
; RV32IM-NEXT:    sub s2, a4, a7
; RV32IM-NEXT:    sub s3, a5, a3
; RV32IM-NEXT:    sltu a3, a4, s2
; RV32IM-NEXT:    sub s4, a2, a3
; RV32IM-NEXT:    sub s5, a6, t0
; RV32IM-NEXT:    li a2, 1
; RV32IM-NEXT:    mv a1, a3
; RV32IM-NEXT:    li a3, 0
; RV32IM-NEXT:    call __umoddi3
; RV32IM-NEXT:    mv s5, a0
; RV32IM-NEXT:    mv s6, a1
; RV32IM-NEXT:    li a2, 654
; RV32IM-NEXT:    mv a0, s3
; RV32IM-NEXT:    mv a1, s4
; RV32IM-NEXT:    li a3, 0
; RV32IM-NEXT:    call __umoddi3
; RV32IM-NEXT:    mv s3, a0
; RV32IM-NEXT:    mv s4, a1
; RV32IM-NEXT:    lui a2, 1
; RV32IM-NEXT:    addi a2, a2, 1327
; RV32IM-NEXT:    mv a0, s1
; RV32IM-NEXT:    mv a1, s2
; RV32IM-NEXT:    li a3, 0
; RV32IM-NEXT:    call __umoddi3
; RV32IM-NEXT:    sw s7, 16(s0)
; RV32IM-NEXT:    sw s5, 16(s0)
; RV32IM-NEXT:    sw zero, 20(s0)
; RV32IM-NEXT:    sw a0, 24(s0)
; RV32IM-NEXT:    sw a1, 28(s0)
; RV32IM-NEXT:    sw s5, 0(s0)
; RV32IM-NEXT:    sw s6, 4(s0)
; RV32IM-NEXT:    sw s3, 8(s0)
; RV32IM-NEXT:    sw s1, 24(s0)
; RV32IM-NEXT:    sw s3, 28(s0)
; RV32IM-NEXT:    sw a0, 0(s0)
; RV32IM-NEXT:    sw a1, 4(s0)
; RV32IM-NEXT:    sw s2, 8(s0)
; RV32IM-NEXT:    sw s4, 12(s0)
; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s5, 20(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s6, 16(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s7, 12(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    addi sp, sp, 48
; RV32IM-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
; RV32IM-NEXT:    addi sp, sp, 32
; RV32IM-NEXT:    ret
;
; RV64I-LABEL: dont_fold_urem_i64:
; RV64I-LABEL: fold_urem_i64:
; RV64I:       # %bb.0:
; RV64I-NEXT:    addi sp, sp, -48
; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
@@ -962,7 +1025,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV64I-NEXT:    addi sp, sp, 48
; RV64I-NEXT:    ret
;
; RV64IM-LABEL: dont_fold_urem_i64:
; RV64IM-LABEL: fold_urem_i64:
; RV64IM:       # %bb.0:
; RV64IM-NEXT:    ld a2, 8(a1)
; RV64IM-NEXT:    ld a3, 16(a1)
+475 −30

File changed.

Preview size limit exceeded, changes collapsed.

+517 −0

File changed.

Preview size limit exceeded, changes collapsed.

Loading