Commit 6a18a954 authored by Vasileios Porpodas's avatar Vasileios Porpodas
Browse files

[SLP] Look-ahead operand reordering heuristic.

Summary: This patch introduces a new heuristic for guiding operand reordering. The new "look-ahead" heuristic can look beyond the immediate predecessors. This helps break ties when the immediate predecessors have identical opcodes (see lit test for examples).

Reviewers: RKSimon, ABataev, dtemirbulatov, Ayal, hfinkel, rnk

Reviewed By: RKSimon, dtemirbulatov

Subscribers: xbolva00, Carrot, hiraditya, phosek, rnk, rcorcs, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D60897
parent ac385ca6
Loading
Loading
Loading
Loading
+260 −46
Original line number Diff line number Diff line
@@ -147,6 +147,20 @@ static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The Look-ahead heuristic goes through the users of the bundle to calculate
// the users cost in getExternalUsesCost(). To avoid compilation time increase
// we limit the number of users visited to this value.
static cl::opt<unsigned> LookAheadUsersBudget(
    "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
    cl::desc("The maximum number of users to visit while visiting the "
             "predecessors. This prevents compilation time increase."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));
@@ -721,6 +735,7 @@ public:

    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -746,6 +761,227 @@ public:
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    // The hard-coded scores listed here are not very important. When computing
    // the scores of matching one sub-tree with another, we are basically
    // counting the number of values that are matching. So even if all scores
    // are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 3;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// User exteranl to the vectorized code.
    static const int ExternalUseCost = 1;
    /// The user is internal but in a different lane.
    static const int UserInDiffLaneCost = ExternalUseCost;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
                               ScalarEvolution &SE) {
      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2)
        return isConsecutiveAccess(LI1, LI2, DL, SE)
                   ? VLOperands::ScoreConsecutiveLoads
                   : VLOperands::ScoreFail;

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return VLOperands::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
      auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
      if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
          cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
              cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
        return VLOperands::ScoreConsecutiveExtracts;
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1 == I2)
          return VLOperands::ScoreSplat;
        InstructionsState S = getSameOpcode({I1, I2});
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
                                  : VLOperands::ScoreSameOpcode;
      }

      if (isa<UndefValue>(V2))
        return VLOperands::ScoreUndef;

      return VLOperands::ScoreFail;
    }

    /// Holds the values and their lane that are taking part in the look-ahead
    /// score calculation. This is used in the external uses cost calculation.
    SmallDenseMap<Value *, int> InLookAheadValues;

    /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
    /// either external to the vectorized code, or require shuffling.
    int getExternalUsesCost(const std::pair<Value *, int> &LHS,
                            const std::pair<Value *, int> &RHS) {
      int Cost = 0;
      SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
      for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
        Value *V = Values[Idx].first;
        // Calculate the absolute lane, using the minimum relative lane of LHS
        // and RHS as base and Idx as the offset.
        int Ln = std::min(LHS.second, RHS.second) + Idx;
        assert(Ln >= 0 && "Bad lane calculation");
        unsigned UsersBudget = LookAheadUsersBudget;
        for (User *U : V->users()) {
          if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
            // The user is in the VectorizableTree. Check if we need to insert.
            auto It = llvm::find(UserTE->Scalars, U);
            assert(It != UserTE->Scalars.end() && "U is in UserTE");
            int UserLn = std::distance(UserTE->Scalars.begin(), It);
            assert(UserLn >= 0 && "Bad lane");
            if (UserLn != Ln)
              Cost += UserInDiffLaneCost;
          } else {
            // Check if the user is in the look-ahead code.
            auto It2 = InLookAheadValues.find(U);
            if (It2 != InLookAheadValues.end()) {
              // The user is in the look-ahead code. Check the lane.
              if (It2->second != Ln)
                Cost += UserInDiffLaneCost;
            } else {
              // The user is neither in SLP tree nor in the look-ahead code.
              Cost += ExternalUseCost;
            }
          }
          // Limit the number of visited uses to cap compilation time.
          if (--UsersBudget == 0)
            break;
        }
      }
      return Cost;
    }

    /// Go through the operands of \p LHS and \p RHS recursively until \p
    /// MaxLevel, and return the cummulative score. For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///     G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
    /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
                           const std::pair<Value *, int> &RHS, int CurrLevel,
                           int MaxLevel) {

      Value *V1 = LHS.first;
      Value *V2 = RHS.first;
      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
                                       getExternalUsesCost(LHS, RHS));
      int Lane1 = LHS.second;
      int Lane2 = RHS.second;

      // If reached MaxLevel,
      //  or if V1 and V2 are not instructions,
      //  or if they are SPLAT,
      //  or if they are not consecutive, early return the current cost.
      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Keep track of in-tree values for determining the external-use cost.
      InLookAheadValues[V1] = Lane1;
      InLookAheadValues[V2] = Lane2;

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all possbile
      // operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level
          int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
                                            {I2->getOperand(OpIdx2), Lane2},
                                            CurrLevel + 1, MaxLevel);
          // Look for the best score.
          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }

    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match, the more they match the higher the
    /// score. This helps break ties in an informed way when we cannot decide on
    /// the order of the operands by just considering the immediate
    /// predecessors.
    int getLookAheadScore(const std::pair<Value *, int> &LHS,
                          const std::pair<Value *, int> &RHS) {
      InLookAheadValues.clear();
      return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
    }

    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its opreand index.
    // If no good match can be found, return None.
@@ -763,9 +999,6 @@ public:
      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      const unsigned BestScore = 2;
      const unsigned GoodScore = 1;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
      // are using the score to differentiate between the two.
@@ -794,41 +1027,19 @@ public:
        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
          if (isa<LoadInst>(Op)) {
            // Figure out which is left and right, so that we can check for
            // consecutive loads
        case ReorderingMode::Constant:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
            if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
                                    cast<LoadInst>(OpRight), DL, SE))
              BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Opcode:
          // We accept both Instructions and Undefs, but with different scores.
          if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
               cast<Instruction>(Op)->getOpcode() ==
                   cast<Instruction>(OpLastLane)->getOpcode()) ||
              (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
              isa<UndefValue>(Op)) {
            // An instruction has a higher score than an undef.
            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
          unsigned Score =
              getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
          if (Score > BestOp.Score) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
          }
          }
          break;
        case ReorderingMode::Constant:
          if (isa<Constant>(Op)) {
            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
            if (Score > BestOp.Score) {
              BestOp.Idx = Idx;
              BestOp.Score = Score;
            }
        }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane)
            BestOp.Idx = Idx;
@@ -959,8 +1170,8 @@ public:
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
               ScalarEvolution &SE)
        : DL(DL), SE(SE) {
               ScalarEvolution &SE, const BoUpSLP &R)
        : DL(DL), SE(SE), R(R) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }
@@ -1189,7 +1400,8 @@ private:
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right,
                                             const DataLayout &DL,
                                             ScalarEvolution &SE);
                                             ScalarEvolution &SE,
                                             const BoUpSLP &R);
  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2550,7 +2762,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == SwapP0 && "Commutative Predicate mismatch");
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
      } else {
        // Collect operands - commute if it uses the swapped predicate.
        for (Value *V : VL) {
@@ -2597,7 +2809,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -2789,7 +3001,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -3550,13 +3762,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {

// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(
    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
    SmallVectorImpl<Value *> &Right, const DataLayout &DL,
    ScalarEvolution &SE) {
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right,
                                             const DataLayout &DL,
                                             ScalarEvolution &SE,
                                             const BoUpSLP &R) {
  if (VL.empty())
    return;
  VLOperands Ops(VL, DL, SE);
  VLOperands Ops(VL, DL, SE, R);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
+41 −58

File changed.

Preview size limit exceeded, changes collapsed.

+223 −33

File changed.

Preview size limit exceeded, changes collapsed.