//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  unsigned getBroadcastShuffleOverhead(FixedVectorType *VTy) {
    unsigned Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
    Cost += static_cast<T *>(this)->getVectorInstrCost(
        Instruction::ExtractElement, VTy, 0);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, VTy, i);
    }
    return Cost;
  }
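
  // Worked example (editor's illustration, not part of the original header):
  // broadcasting a hypothetical <4 x float> is modeled as
  //   getVectorInstrCost(ExtractElement, <4 x float>, 0)
  //     + getVectorInstrCost(InsertElement, <4 x float>, i) for i = 0..3,
  // i.e. one extract of element zero plus one insert per result element.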

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(FixedVectorType *VTy) {
    unsigned Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from its
    // arguments plus the cost of inserting them into the result vector.

    // E.g. a <4 x float> shuffle with mask <0,5,2,7> needs to extract
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and index 3 of the second vector, and to insert them
    // at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, VTy, i);
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  unsigned getExtractSubvectorOverhead(FixedVectorType *VTy, int Index,
                                       FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((Index + NumSubElts) <= (int)VTy->getNumElements() &&
           "SK_ExtractSubvector index out of range");

    unsigned Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting the
    // elements from the source vector plus the cost of inserting them into
    // the result subvector.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, VTy, i + Index);
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, SubVTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  unsigned getInsertSubvectorOverhead(FixedVectorType *VTy, int Index,
                                      FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((Index + NumSubElts) <= (int)VTy->getNumElements() &&
           "SK_InsertSubvector index out of range");

    unsigned Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting the
    // elements from the subvector plus the cost of inserting them into the
    // result vector.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, SubVTy, i);
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, VTy, i + Index);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
      case TTI::MIM_Unindexed:
        return ISD::UNINDEXED;
      case TTI::MIM_PreInc:
        return ISD::PRE_INC;
      case TTI::MIM_PreDec:
        return ISD::PRE_DEC;
      case TTI::MIM_PostInc:
        return ISD::POST_INC;
      case TTI::MIM_PostDec:
        return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, unsigned Alignment,
                                      bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool useGPUDivergenceAnalysis() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->isNoopAddrSpaceCast(FromAS, ToAS);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }
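
  // Illustrative use (editor's sketch; the target, type and constants are
  // assumptions): for an access of the form [BaseReg + 4 * IndexReg + 16] in
  // address space 0, a caller could query
  //   isLegalAddressingMode(Int32Ty, /*BaseGV=*/nullptr, /*BaseOffset=*/16,
  //                         /*HasBaseReg=*/true, /*Scale=*/4, /*AddrSpace=*/0);
  // which fills TargetLoweringBase::AddrMode accordingly and lets the target's
  // TargetLowering decide whether that combination is legal.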

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }
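
  // Worked example (editor's illustration, not from the original source):
  // for a switch over cases {0, 1, 2, 3} on a target that allows jump tables,
  // requires at least 4 table entries and rejects bit tests for these
  // destinations, Range evaluates to 4, the range is considered dense,
  // JumpTableSize is set to 4 and a single cluster (1) is returned; when
  // neither a bit test nor a jump table applies, the case count N is returned.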

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  unsigned getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  unsigned getInliningThresholdMultiplier() { return 1; }

  int getInlinerVectorBonusPercent() { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have fewer than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }
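
  // Illustrative effect (editor's note; the buffer size is an assumption): on
  // a subtarget whose scheduling model reports LoopMicroOpBufferSize == 28,
  // a call-free loop gets UP.Partial = UP.Runtime = UP.UpperBound = true and
  // UP.PartialThreshold = 28, so partial/runtime unrolling is capped at
  // roughly 28 instructions, while loops containing real calls are skipped.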

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  bool emitGetActiveLaneMask() {
    return BaseT::emitGetActiveLaneMask();
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  virtual Optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return Optional<unsigned>(
      getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual Optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    Optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts,
                                    bool Insert, bool Extract) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    unsigned Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }
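
  // Worked example (editor's sketch, not part of the original header): for a
  // hypothetical <4 x i32> with DemandedElts = 0b0101, Insert = true and
  // Extract = false, only elements 0 and 2 are demanded, so the overhead is
  //   getVectorInstrCost(InsertElement, <4 x i32>, 0)
  //     + getVectorInstrCost(InsertElement, <4 x i32>, 2)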

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  unsigned getScalarizationOverhead(VectorType *InTy, bool Insert,
                                    bool Extract) {
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
    return static_cast<T *>(this)->getScalarizationOverhead(Ty, DemandedElts,
                                                            Insert, Extract);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied with VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (const Value *A : Args) {
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        auto *VecTy = dyn_cast<VectorType>(A->getType());
        if (VecTy) {
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 ||
                  VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
                 "Vector argument does not match VF");
        } else
          VecTy = FixedVectorType::get(A->getType(), VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(VectorType *InTy,
                                    ArrayRef<const Value *> Args) {
    auto *Ty = cast<FixedVectorType>(InTy);

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(Ty, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(Ty, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Opd1PropInfo, Opd2PropInfo,
                                           Args, CxtI);

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
      unsigned Cost = static_cast<T *>(this)->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(VTy, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
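
  // Worked example (editor's illustration; the type and legalization are
  // assumptions): an fadd on a <8 x float> that legalizes into two v4f32
  // registers gives LT.first == 2 and, being floating point, OpCost == 2, so
  // the legal-operation path returns 2 * 2 = 4. If the operation instead had
  // to be scalarized, the result would be eight scalar fadd costs plus the
  // scalarization overhead of the operands and the result vector.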

  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp) {

    switch (Kind) {
    case TTI::SK_Broadcast:
      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(cast<FixedVectorType>(Tp), Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(cast<FixedVectorType>(Tp), Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            TTI::TargetCostKind CostKind,
                            const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    unsigned SrcSize = SrcLT.second.getSizeInBits();
    unsigned DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::BitCast:
      // Bitcast between types that are legalized to the same type are free and
      // assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::SExt:
      if (!I)
        break;

      if (getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target.
      if (I && isa<LoadInst>(I->getOperand(0))) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) &&
          cast<FixedVectorType>(SrcVTy)->getNumElements() > 1 &&
          cast<FixedVectorType>(DstVTy)->getNumElements() > 1) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        unsigned SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy,
                                          CostKind, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(),
          CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
    }

    llvm_unreachable("Unhandled cast");
  }
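
  // Illustrative example (editor's sketch; legalization details are
  // assumptions): for a vector zext whose source and destination legalize to
  // the same number of equally sized registers, the cast is modeled as one
  // AND per register and costs SrcLT.first; if the source is a load and the
  // corresponding extending load is legal on the target, the zext is free (0).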

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return static_cast<T *>(this)->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Index) +
           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
                                                    VecTy->getElementType(),
                                                    TTI::TCK_RecipThroughput);
  }

  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              TTI::TargetCostKind CostKind,
                              const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the compare or select is scalarized.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                           unsigned AddressSpace,
                           TTI::TargetCostKind CostKind,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    unsigned Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(cast<VectorType>(Src),
                                         Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind,
                                      bool UseMaskForCond = false,
                                      bool UseMaskForGaps = false) {
    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
          Opcode, VecTy, Alignment, AddressSpace, CostKind);
    else
      Cost = static_cast<T *>(this)->getMemoryOpCost(
          Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize =
        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost *= UsedInsts.count() / NumLegalInsts;
    }

    // Then add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub-vectors' elements
      // from the wide vector and inserting them into the sub-vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and insert them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is that of extracting all elements from the
      // sub-vectors and inserting them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    if (!UseMaskForCond)
      return Cost;

    Type *I8Type = Type::getInt8Ty(VT->getContext());
    auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
    SubVT = FixedVectorType::get(I8Type, NumSubElts);

    // The Mask shuffling cost is extract all the elements of the Mask
    // and insert each of them Factor times into the wide vector:
    //
    // E.g. an interleaved group with factor 3:
    //    %mask = icmp ult <8 x i32> %vec1, %vec2
    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extract all mask elements from the <8xi1> mask
    // vector and insert them factor times into the <24xi1> shuffled mask
    // vector.
    for (unsigned i = 0; i < NumSubElts; i++)
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, SubVT, i);

    for (unsigned i = 0; i < NumElts; i++)
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, MaskVT, i);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps)
      Cost += static_cast<T *>(this)->getArithmeticInstrCost(
          BinaryOperator::And, MaskVT, CostKind);

    return Cost;
  }
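
  // Worked example (editor's illustration, not from the original source): an
  // unmasked interleaved load of factor 2 from <8 x i32> with one used index
  // is charged the wide <8 x i32> load plus 4 extracts from the wide vector
  // plus 4 inserts into the <4 x i32> sub-vector, whereas a factor-2 store is
  // charged 8 extracts from the two sub-vectors plus 8 inserts into the wide
  // vector, on top of the wide store itself.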

  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                 TTI::TargetCostKind CostKind) {
    Intrinsic::ID IID = ICA.getID();
    auto *ConcreteTTI = static_cast<T *>(this);

    // Special case some scalar intrinsics.
    if (CostKind != TTI::TCK_RecipThroughput) {
      switch (IID) {
      default:
        break;
      case Intrinsic::cttz:
        if (getTLI()->isCheapToSpeculateCttz())
          return TargetTransformInfo::TCC_Basic;
        break;
      case Intrinsic::ctlz:
        if (getTLI()->isCheapToSpeculateCtlz())
          return TargetTransformInfo::TCC_Basic;
        break;
      case Intrinsic::memcpy:
        return ConcreteTTI->getMemcpyCost(ICA.getInst());
      // TODO: other libc intrinsics.
      }
      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
    }

    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
      return 0;

    // TODO: Combine these two logic paths.
    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    Type *RetTy = ICA.getReturnType();
    unsigned VF = ICA.getVectorFactor();
    unsigned RetVF =
        (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
                             : 1);