Unverified Commit 783bf90e authored by Alexey Bataev's avatar Alexey Bataev Committed by GitHub
Browse files

[SLP]Vectorize operand chains of non-vectorizable instructions

Extend the post-process operand-chain seeding (previously only cmps)
to non-vectorizable calls, invokes, callbrs, non-trivially-vectorizable
intrinsics, atomicrmw, cmpxchg, returns, and stores. Stores are
processed after every other vectorization attempt in the basic block.

Reviewers: bababuck, hiraditya, RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/194248
parent 23483d49
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -143,6 +143,12 @@ private:
  bool vectorizeCmpInsts(iterator_range<ItT> CmpInsts, BasicBlock *BB,
                         slpvectorizer::BoUpSLP &R);

  /// Tries to vectorize the operand chains of the non-vectorizable
  /// instructions in \p Insts.
  template <typename ItT>
  bool vectorizeNonVectorizableInsts(iterator_range<ItT> Insts, BasicBlock *BB,
                                     slpvectorizer::BoUpSLP &R);

  /// Tries to vectorize constructs started from InsertValueInst or
  /// InsertElementInst instructions.
  bool vectorizeInserts(InstSetVector &Instructions, BasicBlock *BB,
+315 −15
Original line number Diff line number Diff line
@@ -232,6 +232,15 @@ static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
// Debug/testing knob: skip the pessimistic cost pre-check and always attempt
// operand-chain vectorization for the collected non-vectorizable stores.
static cl::opt<bool> ForcePostProcessStoresOperands(
    "slp-postprocess-stores-operands", cl::init(false), cl::Hidden,
    cl::desc("Force vectorization of non-vectorizable stores operands."));

// When set, non-vectorizable instructions (calls, atomics, returns, stores)
// are also tried as horizontal-reduction roots, not only as bundle seeds.
static cl::opt<bool> NonVectReductions(
    "slp-non-vectorizables-as-reductions", cl::init(false), cl::Hidden,
    cl::desc(
        "Use non-vectorizable instructions as potential reduction roots."));
/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
@@ -7005,11 +7014,12 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
/// instead of a scalar.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
static InstructionCost getScalarizationOverhead(
    const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty,
    const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
    ArrayRef<Value *> VL = {},
    TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
@@ -7034,7 +7044,7 @@ getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
    return Cost;
  }
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind, ForPoisonSrc, VL);
                                      CostKind, ForPoisonSrc, VL, VIC);
}
/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
@@ -30263,6 +30273,245 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
  return Changed;
}
/// Returns true if \p I is an instruction whose result the SLP vectorizer
/// cannot turn into a vector instruction directly, but whose operand chains
/// may still be worth vectorizing as bundle seeds.
static bool isNonVectorizableInst(const Instruction *I,
                                  const TargetLibraryInfo *TLI) {
  if (const auto *CB = dyn_cast<CallBase>(I)) {
    if (CB->isInlineAsm())
      return false;
    if (const auto *II = dyn_cast<IntrinsicInst>(CB)) {
      if (II->isAssumeLikeIntrinsic())
        return false;
      if (isa<AnyMemIntrinsic>(II))
        return false;
    }
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (isTriviallyVectorizable(ID))
        return false;
      if (!VFDatabase::getMappings(*CI).empty())
        return false;
      if (all_of(CI->args(), [](const Value *Arg) {
            return !isa<Instruction>(Arg) || Arg->getType()->isPointerTy();
          }))
        return false;
      if (any_of(CI->args(), [](const Value *Arg) {
            return Arg->getType()->isPointerTy();
          }))
        return false;
    }
    // Skip vector-returning calls in non-revec mode - we cannot turn their
    // results into wider vectors here.
    return SLPReVec || !CB->getType()->isVectorTy();
  }
  if (isa<AtomicRMWInst, AtomicCmpXchgInst>(I))
    return true;
  if (const auto *RI = dyn_cast<ReturnInst>(I))
    return RI->getNumOperands() > 0 &&
           (SLPReVec || !I->getOperand(0)->getType()->isVectorTy()) &&
           isa<Instruction>(I->getOperand(0));
  return false;
}
/// Visits the value operands of \p I that are candidates for operand-chain
/// vectorization, invoking \p F with each operand and its slot index.
template <typename Func>
static void forEachOperandChainCandidate(Instruction *I, Func F,
                                         bool ForReduction) {
  if (auto *CB = dyn_cast<CallBase>(I)) {
    // For reduction seeding, multi-argument calls are only considered when
    // explicitly enabled via the option.
    if (ForReduction && !NonVectReductions && CB->arg_size() > 1)
      return;
    unsigned ArgIdx = 0;
    for (Use &U : CB->args())
      F(U.get(), ArgIdx++);
  } else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
    F(RMW->getValOperand(), 0);
  } else if (auto *CX = dyn_cast<AtomicCmpXchgInst>(I)) {
    F(CX->getCompareOperand(), 0);
    F(CX->getNewValOperand(), 1);
  } else if (!ForReduction || NonVectReductions) {
    // Stores and returns participate in reduction seeding only when the
    // option is set; they always participate in bundle seeding.
    if (auto *SI = dyn_cast<StoreInst>(I)) {
      F(SI->getValueOperand(), 0);
    } else if (auto *RI = dyn_cast<ReturnInst>(I)) {
      if (RI->getNumOperands() > 0)
        F(RI->getReturnValue(), 0);
    } else {
      llvm_unreachable("Unexpected instruction kind for operand-chain seeding");
    }
  }
}
/// Tries to vectorize the operand chains of the non-vectorizable instructions
/// in \p InstRange (see isNonVectorizableInst). Two passes: first each operand
/// is tried as a horizontal-reduction root; then compatible operands across
/// different roots are grouped by a key and vectorized as bundles.
/// Returns true if any vectorization happened.
template <typename ItT>
bool SLPVectorizerPass::vectorizeNonVectorizableInsts(
    iterator_range<ItT> InstRange, BasicBlock *BB, BoUpSLP &R) {
  SmallVector<Instruction *> Insts(InstRange);
  if (Insts.empty())
    return false;
  // Group roots with the same opcode together; stable sort keeps the
  // original relative order within each opcode group.
  stable_sort(Insts, [](const Instruction *A, const Instruction *B) {
    return A->getOpcode() < B->getOpcode();
  });
  bool Changed = false;
  // Pass 1 - try to find horizontal reductions feeding the root operands.
  SmallPtrSet<Value *, 8> RootSeen;
  for (Instruction *I : Insts) {
    if (R.isDeleted(I))
      continue;
    // Set once the root I itself is erased by a successful vectorization;
    // remaining operands of I must then be skipped.
    bool RootDeleted = false;
    forEachOperandChainCandidate(
        I,
        [&](Value *Op, unsigned /*Position*/) {
          if (RootDeleted)
            return;
          // Only in-block, live, non-shuffle instruction operands with a
          // vectorizable element type are tried as reduction roots.
          auto *RootOp = dyn_cast<Instruction>(Op);
          if (!RootOp || RootOp->getParent() != BB || R.isDeleted(RootOp) ||
              isa<ShuffleVectorInst>(RootOp) ||
              !isValidElementType(RootOp->getType()))
            return;
          // Try each candidate root at most once across all insts.
          if (!RootSeen.insert(RootOp).second)
            return;
          Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
          if (R.isDeleted(I))
            RootDeleted = true;
        },
        /*ForReduction=*/true);
  }
  // Pass 2 - collect the operand instructions across all roots and try to
  // vectorize them as bundles.
  if (Insts.size() < 2)
    return Changed;
  // Key identifying which root "shape" an operand came from; operands are
  // only bundled together when their keys compare equal.
  struct OperandGroupKey {
    enum class Kind : unsigned {
      NonCall = 0,
      Intrinsic,
      NamedFunction,
      IndirectCall,
    };
    Kind RootKind;
    unsigned KindID;    // Intrinsic ID for Intrinsic, opcode for NonCall,
                        // callee value-kind for IndirectCall, 0 for
                        // NamedFunction.
    unsigned SubOp;     // AtomicRMW operation; 0 otherwise.
    StringRef FuncName; // Non-empty only for NamedFunction.
    unsigned Position;  // Operand slot within the root.
    bool operator==(const OperandGroupKey &O) const {
      return RootKind == O.RootKind && KindID == O.KindID && SubOp == O.SubOp &&
             FuncName == O.FuncName && Position == O.Position;
    }
    bool operator!=(const OperandGroupKey &O) const { return !(*this == O); }
    // Lexicographic order over all key fields; used only for sorting
    // operands so equal keys end up adjacent.
    bool less(const OperandGroupKey &O) const {
      if (RootKind != O.RootKind)
        return static_cast<unsigned>(RootKind) <
               static_cast<unsigned>(O.RootKind);
      if (KindID != O.KindID)
        return KindID < O.KindID;
      if (SubOp != O.SubOp)
        return SubOp < O.SubOp;
      if (int C = FuncName.compare(O.FuncName))
        return C < 0;
      return Position < O.Position;
    }
  };
  // Maps each collected operand to the key of the root/slot it feeds.
  SmallDenseMap<Value *, OperandGroupKey> OpKeys;
  // Builds the grouping key for the operand at slot \p Position of root I.
  auto BuildKey = [](Instruction *I, unsigned Position) -> OperandGroupKey {
    if (auto *CB = dyn_cast<CallBase>(I)) {
      if (auto *II = dyn_cast<IntrinsicInst>(CB))
        return {OperandGroupKey::Kind::Intrinsic,
                II->getIntrinsicID(),
                0,
                {},
                Position};
      if (Function *F = CB->getCalledFunction())
        return {OperandGroupKey::Kind::NamedFunction, 0, 0, F->getName(),
                Position};
      return {OperandGroupKey::Kind::IndirectCall,
              CB->getCalledOperand()->getValueID(),
              0,
              {},
              Position};
    }
    unsigned SubOp = 0;
    if (auto *AI = dyn_cast<AtomicRMWInst>(I))
      SubOp = static_cast<unsigned>(AI->getOperation());
    return {
        OperandGroupKey::Kind::NonCall, I->getOpcode(), SubOp, {}, Position};
  };
  // Strict weak order: first by group key, then by type/width/opcode, with
  // program order (comesBefore) as the final tie-breaker for determinism.
  auto OperandSorter = [&OpKeys](Value *V1, Value *V2) -> bool {
    if (V1 == V2)
      return false;
    const OperandGroupKey &K1 = OpKeys.at(V1);
    const OperandGroupKey &K2 = OpKeys.at(V2);
    if (K1 != K2)
      return K1.less(K2);
    auto *I1 = cast<Instruction>(V1);
    auto *I2 = cast<Instruction>(V2);
    if (I1->getType()->getTypeID() != I2->getType()->getTypeID())
      return I1->getType()->getTypeID() < I2->getType()->getTypeID();
    if (I1->getType()->getScalarSizeInBits() !=
        I2->getType()->getScalarSizeInBits())
      return I1->getType()->getScalarSizeInBits() <
             I2->getType()->getScalarSizeInBits();
    if (I1->getOpcode() != I2->getOpcode())
      return I1->getOpcode() < I2->getOpcode();
    return I1->comesBefore(I2);
  };
  // Two operands may share a bundle iff their keys, types and opcodes match.
  auto AreCompatibleOperands = [&OpKeys](ArrayRef<Value *> VL,
                                         Value *V) -> bool {
    if (VL.empty() || VL.back() == V)
      return true;
    const OperandGroupKey &KBack = OpKeys.at(VL.back());
    const OperandGroupKey &K = OpKeys.at(V);
    if (KBack != K)
      return false;
    auto *I1 = cast<Instruction>(VL.back());
    auto *I2 = cast<Instruction>(V);
    return I1->getType() == I2->getType() && I1->getOpcode() == I2->getOpcode();
  };
  SmallVector<Value *> Operands;
  SmallPtrSet<Value *, 8> Seen;
  for (Instruction *I : Insts) {
    if (R.isDeleted(I))
      continue;
    forEachOperandChainCandidate(
        I,
        [&](Value *Op, unsigned Position) {
          // Same filtering as pass 1: in-block, live, non-shuffle
          // instructions of vectorizable element type only.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || OpI->getParent() != BB || R.isDeleted(OpI) ||
              isa<ShuffleVectorInst>(OpI) ||
              !isValidElementType(OpI->getType()))
            return;
          if (!Seen.insert(OpI).second)
            return;
          // try_emplace: if the same operand feeds several roots, keep the
          // key of the first root encountered.
          OpKeys.try_emplace(OpI, BuildKey(I, Position));
          Operands.push_back(OpI);
        },
        /*ForReduction=*/false);
  }
  if (Operands.size() <= 1)
    return Changed;
  // MaxVFOnly=true: first try full-width bundles only; the helper retries
  // smaller groups itself.
  Changed |= tryToVectorizeSequence<Value>(
      Operands, OperandSorter, AreCompatibleOperands,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
@@ -30535,21 +30784,33 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
  // Non-vectorizable root instructions other than stores: calls
  // (regular and intrinsic), invokes, callbrs, atomic RMW/cmpxchg, and
  // returns.
  SmallSetVector<Instruction *, 8> PostProcessInsts;
  // Stores are processed after all other instructions/roots.
  SmallSetVector<StoreInst *, 8> PostProcessStores;
  auto VectorizeInsertsAndCmps = [&](bool AtTerminator) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
    if (AtTerminator) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
      if (!PostProcessInsts.empty())
        Changed |=
            vectorizeNonVectorizableInsts(reverse(PostProcessInsts), BB, R);
      PostProcessInsts.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  // Returns true if `I` is in any of the post-process sets.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    if (PostProcessInsts.contains(I))
      return true;
    if (auto *SI = dyn_cast<StoreInst>(I))
      return PostProcessStores.contains(SI);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
@@ -30572,7 +30833,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
@@ -30652,7 +30913,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
@@ -30665,8 +30926,47 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
    else if (auto *CI = dyn_cast<CmpInst>(It))
      PostProcessCmps.insert(CI);
    else if (auto *SI = dyn_cast<StoreInst>(It);
             SI &&
             (SLPReVec || !SI->getValueOperand()->getType()->isVectorTy()) &&
             isa<Instruction>(SI->getValueOperand()))
      PostProcessStores.insert(SI);
    else if (isNonVectorizableInst(&*It, TLI))
      PostProcessInsts.insert(&*It);
  }
  // Late post-process: run operand-chain vectorization for stores.
  if (!PostProcessStores.empty() &&
      (NonVectReductions || PostProcessStores.size() >= 2)) {
    if (!ForcePostProcessStoresOperands && SLPCostThreshold >= 0) {
      // Use pessimistic cost estimation to avoid long compile time when there
      // are many stores in the list.
      Type *ScalarTy = getValueType(PostProcessStores.front());
      if (!::isValidElementType(ScalarTy))
        return Changed;
      if (!NonVectReductions && PostProcessStores.size() == 2 &&
          cast<Instruction>(PostProcessStores.front()->getValueOperand())
                  ->getOpcode() !=
              cast<Instruction>(PostProcessStores.back()->getValueOperand())
                  ->getOpcode())
        return Changed;
      ScalarTy =
          IntegerType::get(ScalarTy->getContext(),
                           DL->getTypeSizeInBits(ScalarTy->getScalarType()));
      if (auto *ValTy = dyn_cast<VectorType>(
              PostProcessStores.front()->getValueOperand()->getType()))
        ScalarTy = ::getWidenedType(ScalarTy, getNumElements(ValTy));
      auto *VecTy = ::getWidenedType(ScalarTy, PostProcessStores.size());
      InstructionCost ExtractsCost = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, APInt::getAllOnes(PostProcessStores.size()),
          /*Insert=*/false, /*Extract=*/true, TTI::TCK_RecipThroughput,
          /*ForPoisonSrc=*/true, {}, TTI::VectorInstrContext::Store);
      if (ExtractsCost > PostProcessStores.size() + 1)
        return Changed;
    }
    Changed |= vectorizeNonVectorizableInsts(reverse(PostProcessStores), BB, R);
  }
  return Changed;
+0 −2
Original line number Diff line number Diff line
@@ -12,8 +12,6 @@ target triple = "aarch64--linux-gnu"
; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
; REMARK-NEXT:    - Cost: '-8'
;
; REMARK-NOT: Function: gather_load

define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: @gather_multiple_use(
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
+61 −98

File changed.

Preview size limit exceeded, changes collapsed.

+4 −3
Original line number Diff line number Diff line
@@ -5,11 +5,12 @@
define void @strided_load_and_store(ptr %in, ptr %out) {
; CHECK-LABEL: @strided_load_and_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 16
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[IN]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[TMP0]], align 2
; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 2 [[IN:%.*]], i64 16, <2 x i1> splat (i1 true), i32 2)
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 16
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    store <8 x i8> [[TMP1]], ptr [[OUT]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    store <8 x i8> [[TMP2]], ptr [[TMP3]], align 2
; CHECK-NEXT:    ret void
;
Loading