Commit 6c2a4f5f authored by Sjoerd Meijer's avatar Sjoerd Meijer
Browse files

[TTI][LV] preferPredicateOverEpilogue

We have two ways to steer creating a predicated vector body over creating a
scalar epilogue. To force this, we have 1) a command line option and 2) a
pragma available. This adds a third: a target hook to TargetTransformInfo that
can be queried whether predication is preferred or not, which allows the
vectoriser to make the decision without forcing it.

While this change behaves as a non-functional change for now, it shows the
required TTI plumbing, usage of this new hook in the vectoriser, and the
beginning of an ARM MVE implementation. I will follow up on this with:
- a complete MVE implementation, see D69845.
- a patch to disable this, i.e. we should respect "vector_predicate(disable)"
  and its corresponding loophint.

Differential Revision: https://reviews.llvm.org/D69040
parent 9577ee84
Loading
Loading
Loading
Loading
+20 −0
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@ class Function;
class GlobalValue;
class IntrinsicInst;
class LoadInst;
class LoopAccessInfo;
class Loop;
class ProfileSummaryInfo;
class SCEV;
@@ -518,6 +519,13 @@ public:
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) const;

  /// Query the target whether it would be prefered to create a predicated vector
  /// loop, which can avoid the need to emit a scalar epilogue loop.
  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) const;

  /// @}

  /// \name Scalar Target Information
@@ -1201,6 +1209,12 @@ public:
                                        AssumptionCache &AC,
                                        TargetLibraryInfo *LibInfo,
                                        HardwareLoopInfo &HWLoopInfo) = 0;
  virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                           ScalarEvolution &SE,
                                           AssumptionCache &AC,
                                           TargetLibraryInfo *TLI,
                                           DominatorTree *DT,
                                           const LoopAccessInfo *LAI) = 0;
  virtual bool isLegalAddImmediate(int64_t Imm) = 0;
  virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1471,6 +1485,12 @@ public:
                                HardwareLoopInfo &HWLoopInfo) override {
    return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }
  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) override {
    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }
  bool isLegalAddImmediate(int64_t Imm) override {
    return Impl.isLegalAddImmediate(Imm);
  }
+7 −0
Original line number Diff line number Diff line
@@ -213,6 +213,13 @@ public:
    return false;
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) const {
    return false;
  }

  void getUnrollingPreferences(Loop *, ScalarEvolution &,
                               TTI::UnrollingPreferences &) {}

+7 −0
Original line number Diff line number Diff line
@@ -510,6 +510,13 @@ public:
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;
+6 −0
Original line number Diff line number Diff line
@@ -243,6 +243,12 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
  return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}

bool TargetTransformInfo::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
    ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI,
    DominatorTree *DT, const LoopAccessInfo *LAI) const {
  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
}

void TargetTransformInfo::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
  return TTIImpl->getUnrollingPreferences(L, SE, UP);
+44 −0
Original line number Diff line number Diff line
@@ -1000,6 +1000,50 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
  return true;
}

bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                             ScalarEvolution &SE,
                                             AssumptionCache &AC,
                                             TargetLibraryInfo *TLI,
                                             DominatorTree *DT,
                                             const LoopAccessInfo *LAI) {
  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/stores instructions:
  if (!ST->hasMVEIntegerOps())
    return false;

  HardwareLoopInfo HWLoopInfo(L);
  if (!HWLoopInfo.canAnalyze(*LI)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "analyzable.\n");
    return false;
  }

  // This checks if we have the low-overhead branch architecture
  // extension, and if we will create a hardware-loop:
  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "profitable.\n");
    return false;
  }

  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "a candidate.\n");
    return false;
  }

  // TODO: to set up a tail-predicated loop, which works by setting up
  // the total number of elements processed by the loop, we need to
  // determine the element size here, and if it is uniform for all operations
  // in the vector loop. This means we will reject narrowing/widening
  // operations, and don't want to predicate the vector loop, which is
  // the main prep step for tail-predicated loops.

  return false;
}


void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  // Only currently enable these preferences for M-Class cores.
Loading