Commit c1b347ea authored by Alexey Bataev, committed by GitHub

[SLP] Refine loop-aware gather cost and admit sibling-loop subtrees

Factor the loop-nest walk out of getScaleToLoopIterations() into a pure
getLoopNestScale(). It is reused by a new getGatherNodeEffectiveScale()
(guarded by -slp-per-lane-gather-scale, on by default) that averages
per-lane scales, so LICM-hoistable gather operands no longer pay the
inner loop's trip count. Also relax the tree-builder's loop-nest guard
to admit sibling inner loops that share a common outer loop, but only
when SCEV proves their backedge-taken counts equal.

Reviewers: hiraditya, RKSimon, bababuck

Pull Request: https://github.com/llvm/llvm-project/pull/192801
parent ba2f42d5
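To see the shape of the per-lane refinement, here is a minimal standalone
sketch of the averaging arithmetic; the names and inputs are illustrative,
not the patch's API (the real getGatherNodeEffectiveScale() derives each
lane's scale from LoopInfo/SCEV and uses LLVM's saturating helpers rather
than plain arithmetic):

#include <algorithm>
#include <cstdint>
#include <vector>

// Each lane pays the scale of the loop its operand lives in; the entry's
// effective scale is the ceil-divided average, clamped so the refinement
// can never exceed the whole-entry (base) scale the old model charged.
uint64_t effectiveGatherScale(const std::vector<uint64_t> &LaneScales,
                              uint64_t BaseScale) {
  if (LaneScales.empty())
    return BaseScale; // No per-lane info: fall back to the whole-entry scale.
  uint64_t Sum = 0;
  for (uint64_t S : LaneScales)
    Sum += std::min(S, BaseScale); // A refinement may only lower the cost.
  uint64_t N = LaneScales.size();
  uint64_t Avg = (Sum + N - 1) / N; // Ceil-divide so the scale stays >= 1.
  return std::clamp<uint64_t>(Avg, 1, BaseScale);
}

For a two-lane gather where one operand is loop-variant (scale 100) and the
other is hoistable (scale 1), this yields 51 instead of charging both lanes
the inner loop's 100.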
+181 −47
@@ -259,6 +259,18 @@ static cl::opt<unsigned> LoopAwareTripCount(
     cl::desc("Loop trip count, considered by the cost model during "
              "modeling (0=loops are ignored and considered flat code)"));
 
+/// Refine the loop-aware cost scaling of gather/buildvector tree entries by
+/// using the per-lane execution scale of the operand that feeds each lane,
+/// instead of a single whole-entry scale. This matches the LICM hoisting
+/// performed by optimizeGatherSequence() at codegen time: lanes whose
+/// operands are loop-invariant in an inner loop contribute the outer loop's
+/// execution scale rather than the inner loop's, which avoids over-costing
+/// buildvectors that bridge values from outer loop nests into an inner loop.
+static cl::opt<bool> PerLaneGatherScale(
+    "slp-per-lane-gather-scale", cl::init(true), cl::Hidden,
+    cl::desc("Use per-lane execution scale for gather/buildvector tree "
+             "entries to model LICM-hoistable buildvector sequences."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -3942,10 +3954,27 @@ private:
   /// external use.
   /// \p U is the user of the vectorized value from the entry, if using the
   /// parent for the external use.
-  unsigned getScaleToLoopIterations(const TreeEntry &TE,
+  uint64_t getScaleToLoopIterations(const TreeEntry &TE,
                                     Value *Scalar = nullptr,
                                     Instruction *U = nullptr);
 
+  /// \returns the product of trip counts of the loop \p L and all of its
+  /// enclosing loops. Unlike the state kept by getScaleToLoopIterations(),
+  /// this helper depends only on the loop structure and is independent of
+  /// per-entry operand invariance. Returns 1 when loop-aware cost modeling
+  /// is disabled or \p L is null.
+  uint64_t getLoopNestScale(const Loop *L);
+
+  /// \returns a refined execution scale for a gather/buildvector tree entry
+  /// \p TE. The scale is computed as the average of per-lane execution
+  /// scales: each lane's scale is the loop-nest scale of the loop that
+  /// contains the lane's defining instruction (or 1 if the lane is a
+  /// constant / loop-invariant non-instruction value). This models the
+  /// LICM hoisting that optimizeGatherSequence() performs after vectorization
+  /// for inserts with loop-invariant operands. Falls back to the whole-entry
+  /// scale when per-lane information is unavailable or the feature is off.
+  uint64_t getGatherNodeEffectiveScale(const TreeEntry &TE);
+
   /// Get the loop nest for the given loop \p L.
   ArrayRef<const Loop *> getLoopNest(const Loop *L);
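As a rough model of the getLoopNestScale() contract documented above, the
sketch below computes the product of trip counts along a parent chain with
memoization. The ToyLoop struct and its TripCount field are stand-ins for
LLVM's LoopInfo/SCEV queries, and the recursion replaces the patch's
explicit collect-and-reverse chain walk (which additionally saturates
instead of wrapping on overflow):

#include <cstdint>
#include <unordered_map>

// Toy stand-in for llvm::Loop: a parent pointer plus a known trip count.
struct ToyLoop {
  const ToyLoop *Parent = nullptr;
  uint64_t TripCount = 1;
};

// Memoized nest scale: trip count of L times the scale of its parent.
// Matches the documented contract of returning 1 for a null loop.
uint64_t loopNestScale(const ToyLoop *L,
                       std::unordered_map<const ToyLoop *, uint64_t> &Cache) {
  if (!L)
    return 1;
  if (auto It = Cache.find(L); It != Cache.end())
    return It->second;
  uint64_t Scale = L->TripCount * loopNestScale(L->Parent, Cache);
  Cache[L] = Scale;
  return Scale;
}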
@@ -4904,9 +4933,10 @@ private:
   /// Maps the loops to their loop nests.
   SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
-  /// Maps the loops to their scale factor, which is built as a multiplication
-  /// of the tripcounts of the loops in the loop nest.
-  SmallDenseMap<const Loop *, unsigned> LoopToScaleFactor;
+  /// Per-loop cache of nest scale factors: the product of trip counts of the
+  /// loop and all of its ancestors. Shared by getLoopNestScale() and (via it)
+  /// by getScaleToLoopIterations() and getGatherNodeEffectiveScale().
+  SmallDenseMap<const Loop *, uint64_t> LoopNestScaleCache;
 
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
@@ -12645,22 +12675,56 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                  S.getMainOp()->getParent()) {
     BasicBlock *Parent = S.getMainOp()->getParent();
     if (const Loop *L = LI->getLoopFor(Parent)) {
-      // Check that the new loop nest is not involved.
-      // Otherwise, mark it as a gather node.
+      // Check that the new loop nest shares the same outer structure as the
+      // tree's current loop nest. Completely disjoint nests (different
+      // outermost loops) are forced to gather because their scales cannot be
+      // meaningfully combined. Sibling inner loops (inside a common outer
+      // loop or outside any loops at all) are allowed: the cost model scales
+      // each entry by its own loop via getScaleToLoopIterations(), so a tree
+      // that spans sibling inner loops (e.g. a PHI at their merge block) can
+      // still be costed correctly. Contract CurrentLoopNest to the longest
+      // common prefix with the new entry's nest so subsequent entries in yet
+      // another sibling can also be admitted.
       L = findInnermostNonInvariantLoop(L, VL);
       if (L) {
         SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
+        unsigned CommonLen = 0;
         for (const auto [L1, L2] : zip(CurrentLoopNest, NewLoopNest)) {
-          if (L1 != L2) {
-            LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n");
-            newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
-            return;
-          }
+          if (L1 != L2)
+            break;
+          ++CommonLen;
         }
-        if (NewLoopNest.size() > CurrentLoopNest.size())
-          CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
-                                 NewLoopNest.end());
+        if (CurrentLoopNest.empty()) {
+          CurrentLoopNest.assign(NewLoopNest);
+        } else if (CommonLen < CurrentLoopNest.size() &&
+                   CommonLen < NewLoopNest.size()) {
+          // Divergence below the common prefix: the tree now spans sibling
+          // loops at depth CommonLen. Admitting them into one tree makes
+          // the profitability decision JOINT across both siblings, so a
+          // very hot sibling could otherwise let an unprofitable cold
+          // sibling ride along "for free" (per-entry scaling of the cold
+          // sibling's entries would be dwarfed by the hot one). Require
+          // SCEV-proven equal backedge-taken counts for the diverging
+          // siblings before joining; otherwise force gather.
+          const Loop *SibA = CurrentLoopNest[CommonLen];
+          const Loop *SibB = NewLoopNest[CommonLen];
+          const SCEV *BecA = SE->getBackedgeTakenCount(SibA);
+          const SCEV *BecB = SE->getBackedgeTakenCount(SibB);
+          if (isa<SCEVCouldNotCompute>(BecA) || BecA != BecB) {
+            LLVM_DEBUG(dbgs()
+                       << "SLP: Sibling loops have different trip counts.\n");
+            newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+            return;
+          }
+          CurrentLoopNest.truncate(CommonLen);
+        } else if (NewLoopNest.size() > CurrentLoopNest.size()) {
+          // New entry lives deeper in the same nest chain; extend.
+          CurrentLoopNest.append(
+              std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
+              NewLoopNest.end());
+        }
+        // Otherwise NewLoopNest is a prefix of CurrentLoopNest: keep as-is.
       }
     }
   }
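The three cases the comment distinguishes (adopt, sibling divergence,
extend) are easier to see in isolation. Below is a self-contained sketch of
the same prefix/divergence logic; LoopID and the EqualTripCounts callback
are stand-ins for const Loop * and the SCEV backedge-taken-count query:

#include <cstddef>
#include <functional>
#include <vector>

using LoopID = int;

// Returns true if the new entry's nest may join the tree's nest, updating
// Current in place; returns false when diverging siblings fail the
// trip-count predicate (the real code then forces a gather node).
bool joinLoopNests(std::vector<LoopID> &Current,
                   const std::vector<LoopID> &New,
                   const std::function<bool(LoopID, LoopID)> &EqualTripCounts) {
  size_t CommonLen = 0;
  while (CommonLen < Current.size() && CommonLen < New.size() &&
         Current[CommonLen] == New[CommonLen])
    ++CommonLen;
  if (Current.empty()) {
    Current = New; // First loop-resident entry: adopt its nest.
  } else if (CommonLen < Current.size() && CommonLen < New.size()) {
    // Siblings diverge below the common prefix: join only when their trip
    // counts are provably equal, then keep just the shared prefix.
    if (!EqualTripCounts(Current[CommonLen], New[CommonLen]))
      return false;
    Current.resize(CommonLen);
  } else if (New.size() > Current.size()) {
    // Same chain, deeper: extend to the new, deeper nest.
    Current.assign(New.begin(), New.end());
  }
  // Otherwise New is a prefix of Current: keep as-is.
  return true;
}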
@@ -16069,7 +16133,7 @@ static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE) {
   return LoopAwareTripCount;
 }
 
-unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
+uint64_t BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
                                            Instruction *U) {
   BasicBlock *Parent = nullptr;
   if (U) {
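The unsigned-to-uint64_t widening threaded through this patch matters
because the scale is a product of trip counts, and a 32-bit accumulator
wraps quickly. A small demonstration (trip counts illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  // Three nested loops with trip count 2000 each: a nest scale of 8e9.
  uint32_t Narrow = 1; // old 'unsigned' accumulator
  uint64_t Wide = 1;   // new 'uint64_t' accumulator
  for (int I = 0; I < 3; ++I) {
    Narrow *= 2000u; // wraps modulo 2^32 on the last multiply
    Wide *= 2000u;
  }
  // Prints 3705032704 vs 8000000000: the wrapped 32-bit scale would make
  // the deeper, hotter nest look cheaper than it really is.
  std::printf("unsigned: %u\nuint64_t: %llu\n", Narrow,
              (unsigned long long)Wide);
}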
@@ -16096,31 +16160,90 @@ unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
   } else {
     Parent = TE.getMainOp()->getParent();
   }
-  if (const Loop *L = LI->getLoopFor(Parent)) {
-    const auto It = LoopToScaleFactor.find(L);
-    if (It != LoopToScaleFactor.end())
-      return It->second;
-    unsigned Scale = 1;
-    if (const Loop *NonInvL = findInnermostNonInvariantLoop(
-            L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars))) {
-      Scale = getLoopTripCount(NonInvL, *SE);
-      for (const Loop *LN : getLoopNest(NonInvL)) {
-        if (LN == L)
-          break;
-        auto LNRes = LoopToScaleFactor.try_emplace(LN, 0);
-        auto &LoopScale = LNRes.first->getSecond();
-        if (!LNRes.second) {
-          Scale *= LoopScale;
-          break;
-        }
-        Scale *= getLoopTripCount(LN, *SE);
-        LoopScale = Scale;
-      }
-    }
-    LoopToScaleFactor.try_emplace(L, Scale);
-    return Scale;
-  }
-  return 1;
+  const Loop *L = LI->getLoopFor(Parent);
+  if (!L)
+    return 1;
+  // The entry's cost is paid once per execution of the innermost loop in
+  // which some of its operands are variant. Operands that are invariant in
+  // all enclosing loops are executed once (LICM will hoist them out).
+  return getLoopNestScale(findInnermostNonInvariantLoop(
+      L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars)));
+}
+
+uint64_t BoUpSLP::getLoopNestScale(const Loop *L) {
+  if (!L || LoopAwareTripCount == 0)
+    return 1;
+  if (auto It = LoopNestScaleCache.find(L); It != LoopNestScaleCache.end())
+    return It->second;
+  // Collect loops from L outward up to (but not including) the first cached
+  // ancestor or the function top, then walk back inward multiplying trip
+  // counts. Use uint64_t to avoid silent overflow on deep/large nests.
+  SmallVector<const Loop *> Chain;
+  for (const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
+    if (LoopNestScaleCache.contains(Cur))
+      break;
+    Chain.push_back(Cur);
+  }
+  assert(!Chain.empty() && "Early-return above should have handled cache hit.");
+  uint64_t Scale = 1;
+  if (const Loop *Parent = Chain.back()->getParentLoop())
+    Scale = LoopNestScaleCache.lookup(Parent);
+  // Walk from the outermost uncached loop inward, accumulating trip counts.
+  // Use SaturatingMultiply to clamp at uint64_t max on deep/large nests
+  // rather than wrapping around.
+  for (const Loop *Cur : reverse(Chain)) {
+    uint64_t TC = std::max<uint64_t>(1, getLoopTripCount(Cur, *SE));
+    Scale = SaturatingMultiply(Scale, TC);
+    LoopNestScaleCache.try_emplace(Cur, std::max<uint64_t>(1, Scale));
+  }
+  return std::max<uint64_t>(1, Scale);
+}
+
+uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) {
+  // Only meaningful for gather/buildvector-like entries; the per-lane
+  // insertelements that make up such an entry are LICM-hoistable by
+  // optimizeGatherSequence() when their operand is loop-invariant.
+  assert((TE.isGather() || TE.State == TreeEntry::SplitVectorize) &&
+         "Expected gather/split tree entry.");
+  uint64_t BaseScale = getScaleToLoopIterations(TE);
+  if (!PerLaneGatherScale || LoopAwareTripCount == 0 || BaseScale <= 1)
+    return BaseScale;
+  // Average the per-lane execution scales: for each lane, reuse the same
+  // scale helper the rest of the cost model uses, but ask it about that
+  // one lane's value. Lanes that are loop-invariant in the current nest
+  // collapse to their outer-loop scale (or 1 for fully invariant/constant
+  // lanes), which matches the LICM hoisting performed by
+  // optimizeGatherSequence(). Cap per-lane contributions by BaseScale so a
+  // refinement can never raise the cost above the whole-entry scale.
+  // Each lane contributes at most BaseScale, so Sum is bounded above by
+  // N * BaseScale. If BaseScale is near uint64_t max (saturated by
+  // getLoopNestScale on a deep nest) Sum can still overflow uint64_t,
+  // which would silently wrap and produce a wrong average. Use
+  // SaturatingAdd and bail out to BaseScale on overflow: the true average
+  // is bounded above by BaseScale anyway, so this preserves the
+  // refinement's invariant that it can never raise cost.
+  uint64_t Sum = 0;
+  unsigned N = 0;
+  bool Overflow = false;
+  for (Value *V : TE.Scalars) {
+    if (isConstant(V))
+      continue;
+    ++N;
+    uint64_t LaneScale = std::min(getScaleToLoopIterations(TE, V), BaseScale);
+    Sum = SaturatingAdd(Sum, LaneScale, &Overflow);
+    if (Overflow)
+      return BaseScale;
+  }
+  if (N == 0)
+    return BaseScale;
+  // Ceil-divide so we never round the effective scale down below 1.
+  uint64_t Numerator = SaturatingAdd(Sum, uint64_t(N - 1), &Overflow);
+  if (Overflow)
+    return BaseScale;
+  uint64_t Avg = Numerator / N;
+  return std::clamp<uint64_t>(Avg, 1, BaseScale);
 }
 
 InstructionCost
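The overflow handling above relies on LLVM's SaturatingMultiply and
SaturatingAdd from llvm/Support/MathExtras.h. As a standalone illustration
of the clamping behavior (not the LLVM implementation), an equivalent can
be written with the GCC/Clang overflow builtins:

#include <cstdint>
#include <limits>

// Saturating multiply: clamp to UINT64_MAX instead of wrapping, so deep or
// hot loop nests yield a huge-but-ordered scale rather than a small wrapped
// one that would invert cost comparisons.
static uint64_t satMul(uint64_t A, uint64_t B) {
  uint64_t R;
  if (__builtin_mul_overflow(A, B, &R))
    return std::numeric_limits<uint64_t>::max();
  return R;
}

// Saturating add reporting overflow, mirroring how the per-lane sum above
// bails out to the whole-entry scale when it would wrap.
static uint64_t satAdd(uint64_t A, uint64_t B, bool &Overflow) {
  uint64_t R;
  Overflow = __builtin_add_overflow(A, B, &R);
  return Overflow ? std::numeric_limits<uint64_t>::max() : R;
}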
@@ -18025,7 +18148,7 @@ InstructionCost BoUpSLP::getSpillCost() {
     if (It != MinBWs.end())
       ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
     auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
-    unsigned Scale = getScaleToLoopIterations(*Op);
+    uint64_t Scale = getScaleToLoopIterations(*Op);
     InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
     KeepLiveCost *= Scale;
     Cost += KeepLiveCost;
@@ -18462,8 +18585,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
   };
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost = 0;
-  SmallDenseMap<const TreeEntry *, unsigned> EntryToScale;
-  unsigned PrevScale = 0;
+  SmallDenseMap<const TreeEntry *, uint64_t> EntryToScale;
+  uint64_t PrevScale = 0;
   BasicBlock *PrevVecParent = nullptr;
   for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
     TreeEntry &TE = *Ptr;
@@ -18498,8 +18621,14 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
            "Expected gather nodes with users only.");
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
-    unsigned Scale = 0;
+    uint64_t Scale = 0;
     bool CostIsFree = C == 0;
+    // For gather/buildvector (and split-vectorize) entries, prefer the
+    // per-lane refined scale that accounts for LICM-hoistable insertelements
+    // when an operand is invariant in the current loop nest but defined in
+    // an outer loop. This prevents over-costing cross-loop-nest buildvectors.
+    const bool IsGatherLike =
+        TE.isGather() || TE.State == TreeEntry::SplitVectorize;
     if (!CostIsFree && !TE.isGather() && TE.hasState()) {
       if (PrevVecParent == TE.getMainOp()->getParent()) {
         Scale = PrevScale;
@@ -18508,7 +18637,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
       }
     }
     if (!CostIsFree && !Scale) {
-      Scale = getScaleToLoopIterations(TE);
+      Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE)
+                           : getScaleToLoopIterations(TE);
       C *= Scale;
       EntryToScale.try_emplace(&TE, Scale);
       if (!TE.isGather() && TE.hasState()) {
@@ -18872,9 +19002,13 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
         NodesCosts.try_emplace(TE.get(), C);
         continue;
       }
-      unsigned Scale = EntryToScale.lookup(TE.get());
-      if (!Scale)
-        Scale = getScaleToLoopIterations(*TE.get());
+      uint64_t Scale = EntryToScale.lookup(TE.get());
+      if (!Scale) {
+        const bool IsGatherLike =
+            TE->isGather() || TE->State == TreeEntry::SplitVectorize;
+        Scale = IsGatherLike ? getGatherNodeEffectiveScale(*TE.get())
+                             : getScaleToLoopIterations(*TE.get());
+      }
       C *= Scale;
       NodesCosts.try_emplace(TE.get(), C);
     }
@@ -18952,13 +19086,13 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
   }
   InstructionCost Cost = TreeCost;
-  SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, unsigned>
+  SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, uint64_t>
       EntryToScale;
   auto ScaleCost = [&](InstructionCost C, const TreeEntry &TE,
                        Value *Scalar = nullptr, Instruction *U = nullptr) {
     if (!C.isValid() || C == 0)
       return C;
-    unsigned &Scale =
+    uint64_t &Scale =
         EntryToScale.try_emplace(std::make_tuple(&TE, Scalar, U), 0)
             .first->getSecond();
     if (!Scale)
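The ScaleCost lambda memoizes one scale per (entry, scalar, user) key, with
0 meaning "not computed yet". A generic standalone rendering of that caching
pattern, with std::map standing in for SmallDenseMap so the sketch needs no
LLVM headers:

#include <cstdint>
#include <functional>
#include <map>
#include <tuple>

// Memoize an expensive scale computation keyed by a composite key. A zero
// cached value means "not computed yet", matching the try_emplace(..., 0)
// idiom in the patch (valid because real scales are always >= 1).
template <typename... Keys>
uint64_t cachedScale(std::map<std::tuple<Keys...>, uint64_t> &Cache,
                     const std::tuple<Keys...> &Key,
                     const std::function<uint64_t()> &Compute) {
  uint64_t &Scale = Cache.try_emplace(Key, 0).first->second;
  if (!Scale)
    Scale = Compute(); // First query for this key: compute and remember.
  return Scale;
}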
+122 −151

Preview size limit exceeded, changes collapsed.

+7 −4
@@ -36,7 +36,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '12'
+; YAML-NEXT:   - Cost:            '10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'

@@ -47,6 +47,8 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -66,10 +68,11 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[T4]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T6]], i32 0
@@ -128,7 +131,7 @@ for.body:
 ; YAML:      Function:        getelementptr_2x32
 ; YAML:     Args:
 ; YAML:        - String:          'SLP vectorized with cost '
-; YAML:        - Cost:            '12'
+; YAML:        - Cost:            '10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'

+9 −10
@@ -207,22 +207,21 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
-; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[A1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[L_3:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L_4:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> <float 3.000000e+00, float 3.000000e+00, float poison, float 3.000000e+00>, float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[L_3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[L_4]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
-; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast float [[X:%.*]], [[L_3]]
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_4]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
-; CHECK-NEXT:    [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[RED_NEXT]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP7]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 10
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+1 −1
@@ -73,7 +73,7 @@ define void @fun1(double %0) {
 ; REMARK-LABEL: Function: fun1
 ; REMARK: Args:
 ; REMARK:      - String:          'SLP vectorized with cost '
-; REMARK-NEXT: - Cost:            '-2'
+; REMARK-NEXT: - Cost:            '-3'
 
   br label %2