//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This pass exposes codegen information to IR-level passes. Every
/// transformation that uses codegen information is broken into three parts:
/// 1. The IR-level analysis pass.
/// 2. The IR-level transformation interface which provides the needed
///    information.
/// 3. Codegen-level implementation which uses target-specific hooks.
///
/// This file defines #2, which is the interface that IR-level transformations
/// use for querying the codegen.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H

#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/DataTypes.h"
#include <functional>

namespace llvm {

namespace Intrinsic {
typedef unsigned ID;
}

class AssumptionCache;
class BlockFrequencyInfo;
class DominatorTree;
class BranchInst;
class CallBase;
class ExtractElementInst;
class Function;
class GlobalValue;
class IntrinsicInst;
class LoadInst;
class LoopAccessInfo;
class Loop;
class LoopInfo;
class ProfileSummaryInfo;
class SCEV;
class ScalarEvolution;
class StoreInst;
class SwitchInst;
class TargetLibraryInfo;
class Type;
class User;
class Value;
template <typename T> class Optional;

/// Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
  /// This is the pointer that the intrinsic is loading from or storing to.
  /// If this is non-null, then analysis/optimization passes can assume that
  /// this intrinsic is functionally equivalent to a load/store from this
  /// pointer.
  Value *PtrVal = nullptr;

  // Ordering for atomic operations.
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;

  // Same Id is set by the target for corresponding load/store intrinsics.
  unsigned short MatchingId = 0;

  bool ReadMem = false;
  bool WriteMem = false;
  bool IsVolatile = false;

  bool isUnordered() const {
    return (Ordering == AtomicOrdering::NotAtomic ||
            Ordering == AtomicOrdering::Unordered) &&
           !IsVolatile;
  }
};

/// Attributes of a target dependent hardware loop.
struct HardwareLoopInfo {
  HardwareLoopInfo() = delete;
  HardwareLoopInfo(Loop *L) : L(L) {}
  Loop *L = nullptr;
  BasicBlock *ExitBlock = nullptr;
  BranchInst *ExitBranch = nullptr;
  const SCEV *ExitCount = nullptr;
  IntegerType *CountType = nullptr;
  Value *LoopDecrement = nullptr; // Decrement the loop counter by this
                                  // value in every iteration.
  bool IsNestingLegal = false;    // Can a hardware loop be a parent to
                                  // another hardware loop?
  bool CounterInReg = false;      // Should loop counter be updated in
                                  // the loop via a phi?
  bool PerformEntryTest = false;  // Generate the intrinsic which also performs
                                  // icmp ne zero on the loop counter value and
                                  // produces an i1 to guard the loop entry.
  bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
                               DominatorTree &DT, bool ForceNestedLoop = false,
                               bool ForceHardwareLoopPHI = false);
  bool canAnalyze(LoopInfo &LI);
};
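
// Illustrative sketch (not part of the upstream header): a pass that wants a
// hardware loop would typically combine this struct with the
// isHardwareLoopProfitable() hook declared further below.  SE, LI, DT, AC and
// LibInfo are assumed to be the usual analyses available to a loop pass.
//
//   HardwareLoopInfo HWLoopInfo(L);
//   if (HWLoopInfo.canAnalyze(LI) &&
//       TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo) &&
//       HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT))
//     ... emit the hardware-loop form of the loop ...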

class IntrinsicCostAttributes {
  const IntrinsicInst *II = nullptr;
  Type *RetTy = nullptr;
  Intrinsic::ID IID;
  SmallVector<Type *, 4> ParamTys;
  SmallVector<Value *, 4> Arguments;
  FastMathFlags FMF;
  unsigned VF = 1;
  // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
  // arguments and the return value will be computed based on types.
  unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();

public:
  IntrinsicCostAttributes(const IntrinsicInst &I);

  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);

  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
                          unsigned Factor);

  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
                          unsigned Factor, unsigned ScalarCost);

  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                          ArrayRef<Type *> Tys, FastMathFlags Flags);

  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                          ArrayRef<Type *> Tys, FastMathFlags Flags,
                          unsigned ScalarCost);

  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                          ArrayRef<Type *> Tys, FastMathFlags Flags,
                          unsigned ScalarCost,
                          const IntrinsicInst *I);

  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                          ArrayRef<Type *> Tys);

  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                          ArrayRef<Value *> Args);

  Intrinsic::ID getID() const { return IID; }
  const IntrinsicInst *getInst() const { return II; }
  Type *getReturnType() const { return RetTy; }
  unsigned getVectorFactor() const { return VF; }
  FastMathFlags getFlags() const { return FMF; }
  unsigned getScalarizationCost() const { return ScalarizationCost; }
  const SmallVectorImpl<Value *> &getArgs() const { return Arguments; }
  const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }

  bool isTypeBasedOnly() const {
    return Arguments.empty();
  }

  bool skipScalarizationCost() const {
    return ScalarizationCost != std::numeric_limits<unsigned>::max();
  }
};
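
// Illustrative sketch (not part of the upstream header): cost clients can
// build an IntrinsicCostAttributes object directly from an IntrinsicInst and
// hand it to TTI's intrinsic cost query (not shown in this excerpt).
//
//   if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
//     IntrinsicCostAttributes CostAttrs(*II);
//     // CostAttrs.isTypeBasedOnly() reports whether only types, and no
//     // argument values, were captured for the cost computation.
//   }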

class TargetTransformInfo;
typedef TargetTransformInfo TTI;

/// This pass provides access to the codegen interfaces that are needed
/// for IR-level transformations.
class TargetTransformInfo {
public:
  /// Construct a TTI object using a type implementing the \c Concept
  /// API below.
  ///
  /// This is used by targets to construct a TTI wrapping their target-specific
  /// implementation that encodes appropriate costs for their target.
  template <typename T> TargetTransformInfo(T Impl);

  /// Construct a baseline TTI object using a minimal implementation of
  /// the \c Concept API below.
  ///
  /// The TTI implementation will reflect the information in the DataLayout
  /// provided if non-null.
  explicit TargetTransformInfo(const DataLayout &DL);

  // Provide move semantics.
  TargetTransformInfo(TargetTransformInfo &&Arg);
  TargetTransformInfo &operator=(TargetTransformInfo &&RHS);

  // We need to define the destructor out-of-line to define our sub-classes
  // out-of-line.
  ~TargetTransformInfo();

  /// Handle the invalidation of this information.
  ///
  /// When used as a result of \c TargetIRAnalysis this method will be called
  /// when the function this was computed for changes. When it returns false,
  /// the information is preserved across those changes.
  bool invalidate(Function &, const PreservedAnalyses &,
                  FunctionAnalysisManager::Invalidator &) {
    // FIXME: We should probably in some way ensure that the subtarget
    // information for a function hasn't changed.
    return false;
  }

  /// \name Generic Target Information
  /// @{

  /// The kind of cost model.
  ///
  /// There are several different cost models that can be customized by the
  /// target. The normalization of each cost model may be target specific.
  enum TargetCostKind {
    TCK_RecipThroughput, ///< Reciprocal throughput.
    TCK_Latency,         ///< The latency of the instruction.
    TCK_CodeSize,        ///< Instruction code size.
    TCK_SizeAndLatency   ///< The weighted sum of size and latency.
  };

  /// Query the cost of a specified instruction.
  ///
  /// Clients should use this interface to query the cost of an existing
  /// instruction. The instruction must have a valid parent (basic block).
  ///
  /// Note, this method does not cache the cost calculation and it
  /// can be expensive in some cases.
  int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const {
    switch (kind) {
    case TCK_RecipThroughput:
      return getInstructionThroughput(I);

    case TCK_Latency:
      return getInstructionLatency(I);

    case TCK_CodeSize:
    case TCK_SizeAndLatency:
      return getUserCost(I, kind);
    }
    llvm_unreachable("Unknown instruction cost kind");
  }
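
  // Illustrative sketch (not part of the upstream header): querying the
  // reciprocal-throughput cost of an existing instruction I from a pass that
  // holds a TargetTransformInfo reference TTI (both names assumed).
  //
  //   int Cost =
  //       TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
  //   bool IsCheap = Cost <= TargetTransformInfo::TCC_Basic;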

  /// Underlying constants for 'cost' values in this interface.
  ///
  /// Many APIs in this interface return a cost. This enum defines the
  /// fundamental values that should be used to interpret (and produce) those
  /// costs. The costs are returned as an int rather than a member of this
  /// enumeration because it is expected that the cost of one IR instruction
  /// may have a multiplicative factor to it or otherwise won't fit directly
  /// into the enum. Moreover, it is common to sum or average costs which works
  /// better as simple integral values. Thus this enum only provides constants.
  /// Also note that the returned costs are signed integers to make it natural
  /// to add, subtract, and test with zero (a common boundary condition). It is
  /// not expected that 2^32 is a realistic cost to be modeling at any point.
  ///
  /// Note that these costs should usually reflect the intersection of code-size
  /// cost and execution cost. A free instruction is typically one that folds
  /// into another instruction. For example, reg-to-reg moves can often be
  /// skipped by renaming the registers in the CPU, but they still are encoded
  /// and thus wouldn't be considered 'free' here.
  enum TargetCostConstants {
    TCC_Free = 0,     ///< Expected to fold away in lowering.
    TCC_Basic = 1,    ///< The cost of a typical 'add' instruction.
    TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
  };

  /// Estimate the cost of a GEP operation when lowered.
  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands,
                 TargetCostKind CostKind = TCK_SizeAndLatency) const;

  /// \returns A value by which our inlining threshold should be multiplied.
  /// This is primarily used to bump up the inlining threshold wholesale on
  /// targets where calls are unusually expensive.
  ///
  /// TODO: This is a rather blunt instrument.  Perhaps altering the costs of
  /// individual classes of instructions would be better.
  unsigned getInliningThresholdMultiplier() const;

  /// \returns Vector bonus in percent.
  ///
  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
  /// and apply this bonus based on the percentage of vector instructions. A
  /// bonus is applied if the vector instructions exceed 50% and half that
  /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
  /// arbitrary and evolved over time by accident as much as because they are
  /// principled bonuses.
  /// FIXME: It would be nice to base the bonus values on something more
  /// scientific. A target may have no bonus on vector instructions.
  int getInlinerVectorBonusPercent() const;

  /// \return the expected cost of a memcpy, which could e.g. depend on the
  /// source/destination type and alignment and the number of bytes copied.
  int getMemcpyCost(const Instruction *I) const;

  /// \return The estimated number of case clusters when lowering \p 'SI'.
  /// \p JTSize is set to the jump table size only when \p SI is suitable for a
  /// jump table.
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JTSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) const;

  /// Estimate the cost of a given IR user when lowered.
  ///
  /// This can estimate the cost of either a ConstantExpr or Instruction when
  /// lowered.
  ///
  /// \p Operands is a list of operands which can be a result of transformations
  /// of the current operands. The number of operands on the list must equal
  /// the number of operands the IR user currently has, and their order on the
  /// list must match the order of the IR user's current operands.
  ///
  /// The returned cost is defined in terms of \c TargetCostConstants, see its
  /// comments for a detailed explanation of the cost values.
  int getUserCost(const User *U, ArrayRef<const Value *> Operands,
                  TargetCostKind CostKind) const;

  /// This is a helper function which calls the two-argument getUserCost
  /// with \p Operands which are the current operands U has.
  int getUserCost(const User *U, TargetCostKind CostKind) const {
    SmallVector<const Value *, 4> Operands(U->value_op_begin(),
                                           U->value_op_end());
    return getUserCost(U, Operands, CostKind);
  }
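
  // Illustrative sketch (not part of the upstream header): getUserCost() also
  // accepts constant expressions, so a pass can ask whether a User U (an
  // assumed name) folds away entirely when lowered.
  //
  //   if (TTI.getUserCost(U, TargetTransformInfo::TCK_SizeAndLatency) ==
  //       TargetTransformInfo::TCC_Free)
  //     ... treat U as free ...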

  /// Return true if branch divergence exists.
  ///
  /// Branch divergence has a significantly negative impact on GPU performance
  /// when threads in the same wavefront take different paths due to conditional
  /// branches.
  bool hasBranchDivergence() const;

  /// Return true if the target prefers to use GPU divergence analysis to
  /// replace the legacy version.
  bool useGPUDivergenceAnalysis() const;

  /// Returns whether V is a source of divergence.
  ///
  /// This function provides the target-dependent information for
  /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis
  /// first builds the dependency graph, and then runs the reachability
  /// algorithm starting with the sources of divergence.
  bool isSourceOfDivergence(const Value *V) const;

  /// Returns true for the target-specific set of operations which produce a
  /// uniform result even when taking non-uniform arguments.
  bool isAlwaysUniform(const Value *V) const;

  /// Returns the address space ID for a target's 'flat' address space. Note
  /// this is not necessarily the same as addrspace(0), which LLVM sometimes
  /// refers to as the generic address space. The flat address space is a
  /// generic address space that can be used to access multiple segments of
  /// memory with different address spaces. Accessing a memory location through
  /// a pointer with this address space is expected to be legal but slower
  /// compared to the same memory location accessed through a pointer with a
  /// different address space.
  //
  /// This is for targets with different pointer representations which can
  /// be converted with the addrspacecast instruction. If a pointer is converted
  /// to this address space, optimizations should attempt to replace the access
  /// with the source address space.
  ///
  /// \returns ~0u if the target does not have such a flat address space to
  /// optimize away.
  unsigned getFlatAddressSpace() const;

  /// Return any intrinsic address operand indexes which may be rewritten if
  /// they use a flat address space pointer.
  ///
  /// \returns true if the intrinsic was handled.
  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
  /// NewV, which has a different address space. This should happen for every
  /// operand index that collectFlatAddressOperands returned for the intrinsic.
  /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
  /// new value (which may be the original \p II with modified operands).
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  /// Test whether calls to a function lower to actual program function
  /// calls.
  ///
  /// The idea is to test whether the program is likely to require a 'call'
  /// instruction or equivalent in order to call the given function.
  ///
  /// FIXME: It's not clear that this is a good or useful query API. Client's
  /// should probably move to simpler cost metrics using the above.
  /// Alternatively, we could split the cost interface into distinct code-size
  /// and execution-speed costs. This would allow modelling the core of this
  /// query more accurately as a call is a single small instruction, but
  /// incurs significant execution cost.
  bool isLoweredToCall(const Function *F) const;

  struct LSRCost {
    /// TODO: Some of these could be merged. Also, a lexical ordering
    /// isn't always optimal.
    unsigned Insns;
    unsigned NumRegs;
    unsigned AddRecCost;
    unsigned NumIVMuls;
    unsigned NumBaseAdds;
    unsigned ImmCost;
    unsigned SetupCost;
    unsigned ScaleCost;
  };

  /// Parameters that control the generic loop unrolling transformation.
  struct UnrollingPreferences {
    /// The cost threshold for the unrolled loop. Should be relative to the
    /// getUserCost values returned by this API, and the expectation is that
    /// the unrolled loop's instructions when run through that interface should
    /// not exceed this cost. However, this is only an estimate. Also, specific
    /// loops may be unrolled even with a cost above this threshold if deemed
    /// profitable. Set this to UINT_MAX to disable the loop body cost
    /// restriction.
    unsigned Threshold;
    /// If complete unrolling will reduce the cost of the loop, we will boost
    /// the Threshold by a certain percent to allow more aggressive complete
    /// unrolling. This value provides the maximum boost percentage that we
    /// can apply to Threshold (The value should be no less than 100).
    /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
    ///                                    MaxPercentThresholdBoost / 100)
    /// E.g. if complete unrolling reduces the loop execution time by 50%
    /// then we boost the threshold by the factor of 2x. If unrolling is not
    /// expected to reduce the running time, then we do not increase the
    /// threshold.
    unsigned MaxPercentThresholdBoost;
    /// The cost threshold for the unrolled loop when optimizing for size (set
    /// to UINT_MAX to disable).
    unsigned OptSizeThreshold;
    /// The cost threshold for the unrolled loop, like Threshold, but used
    /// for partial/runtime unrolling (set to UINT_MAX to disable).
    unsigned PartialThreshold;
    /// The cost threshold for the unrolled loop when optimizing for size, like
    /// OptSizeThreshold, but used for partial/runtime unrolling (set to
    /// UINT_MAX to disable).
    unsigned PartialOptSizeThreshold;
    /// A forced unrolling factor (the number of concatenated bodies of the
    /// original loop in the unrolled loop body). When set to 0, the unrolling
    /// transformation will select an unrolling factor based on the current cost
    /// threshold and other factors.
    unsigned Count;
    /// A forced peeling factor (the number of bodies of the original loop
    /// that should be peeled off before the loop body). When set to 0, the
    /// unrolling transformation will select a peeling factor based on profile
    /// information and other factors.
    unsigned PeelCount;
    /// Default unroll count for loops with run-time trip count.
    unsigned DefaultUnrollRuntimeCount;
    // Set the maximum unrolling factor. The unrolling factor may be selected
    // using the appropriate cost threshold, but may not exceed this number
    // (set to UINT_MAX to disable). This does not apply in cases where the
    // loop is being fully unrolled.
    unsigned MaxCount;
    /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
    /// applies even if full unrolling is selected. This allows a target to fall
    /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
    unsigned FullUnrollMaxCount;
    // Represents number of instructions optimized when "back edge"
    // becomes "fall through" in unrolled loop.
    // For now we count a conditional branch on a backedge and a comparison
    // feeding it.
    unsigned BEInsns;
    /// Allow partial unrolling (unrolling of loops to expand the size of the
    /// loop body, not only to eliminate small constant-trip-count loops).
    bool Partial;
    /// Allow runtime unrolling (unrolling of loops to expand the size of the
    /// loop body even when the number of loop iterations is not known at
    /// compile time).
    bool Runtime;
    /// Allow generation of a loop remainder (extra iterations after unroll).
    bool AllowRemainder;
    /// Allow emitting expensive instructions (such as divisions) when computing
    /// the trip count of a loop for runtime unrolling.
    bool AllowExpensiveTripCount;
    /// Apply loop unroll on any kind of loop
    /// (mainly to loops that fail runtime unrolling).
    bool Force;
    /// Allow using trip count upper bound to unroll loops.
    bool UpperBound;
    /// Allow peeling off loop iterations.
    bool AllowPeeling;
    /// Allow peeling off loop iterations for loop nests.
    bool AllowLoopNestsPeeling;
    /// Allow unrolling of all the iterations of the runtime loop remainder.
    bool UnrollRemainder;
    /// Allow unroll and jam. Used to enable unroll and jam for the target.
    bool UnrollAndJam;
    /// Allow peeling based on profile. Used to enable peeling off all
    /// iterations based on the provided profile.
    /// If the value is true the peeling cost model can decide to peel only
    /// some iterations and in this case it will set this to false.
    bool PeelProfiledIterations;
    /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
    /// value above is used during unroll and jam for the outer loop size.
    /// This value is used in the same manner to limit the size of the inner
    /// loop.
    unsigned UnrollAndJamInnerLoopThreshold;
    /// Don't allow loop unrolling to simulate more than this number of
    /// iterations when checking full unroll profitability
    unsigned MaxIterationsCountToAnalyze;
  };

  /// Get target-customized preferences for the generic loop unrolling
  /// transformation. The caller will initialize UP with the current
  /// target-independent defaults.
  void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                               UnrollingPreferences &UP) const;
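
  // Illustrative sketch (not part of the upstream header): the unroller fills
  // UP with its target-independent defaults and then lets the target adjust
  // them; LoopCost is an assumed local value.
  //
  //   TargetTransformInfo::UnrollingPreferences UP;
  //   // ... initialize UP with the generic defaults ...
  //   TTI.getUnrollingPreferences(L, SE, UP);
  //   if (UP.Partial && UP.PartialThreshold > LoopCost)
  //     ... consider partial unrolling ...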

  /// Query the target whether it would be profitable to convert the given loop
  /// into a hardware loop.
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) const;

  /// Query the target whether it would be preferred to create a predicated
  /// vector loop, which can avoid the need to emit a scalar epilogue loop.
  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) const;

  /// Query the target whether lowering of the llvm.get.active.lane.mask
  /// intrinsic is supported.
  bool emitGetActiveLaneMask() const;

  /// @}

  /// \name Scalar Target Information
  /// @{

  /// Flags indicating the kind of support for population count.
  ///
  /// Compared to the SW implementation, HW support is supposed to
  /// significantly boost the performance when the population is dense, and it
  /// may or may not degrade performance if the population is sparse. A HW
  /// support is considered as "Fast" if it can outperform, or is on a par
  /// with, SW implementation when the population is sparse; otherwise, it is
  /// considered as "Slow".
  enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };

  /// Return true if the specified immediate is a legal add immediate; that
  /// is, the target has add instructions which can add a register with the
  /// immediate without having to materialize the immediate into a register.
  bool isLegalAddImmediate(int64_t Imm) const;

  /// Return true if the specified immediate is a legal icmp immediate;
  /// that is, the target has icmp instructions which can compare a register
  /// against the immediate without having to materialize the immediate into a
  /// register.
  bool isLegalICmpImmediate(int64_t Imm) const;

  /// Return true if the addressing mode represented by AM is legal for
  /// this target, for a load/store of the specified type.
  /// The type may be VoidTy, in which case only return true if the addressing
  /// mode is legal for a load/store of any legal type.
  /// If target returns true in LSRWithInstrQueries(), I may be valid.
  /// TODO: Handle pre/postinc as well.
  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace = 0,
                             Instruction *I = nullptr) const;
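
  // Illustrative sketch (not part of the upstream header): asking whether a
  // "base register + constant offset + scaled index" form is legal for an i64
  // load/store (Int64Ty is an assumed type; address space and instruction are
  // left at their defaults).
  //
  //   bool Legal = TTI.isLegalAddressingMode(Int64Ty, /*BaseGV=*/nullptr,
  //                                          /*BaseOffset=*/16,
  //                                          /*HasBaseReg=*/true,
  //                                          /*Scale=*/8);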

  /// Return true if LSR cost of C1 is lower than C2.
  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                     TargetTransformInfo::LSRCost &C2) const;

  /// \returns true if LSR should not optimize a chain that includes \p I.
  bool isProfitableLSRChainElement(Instruction *I) const;

  /// Return true if the target can fuse a compare and branch.
  /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
  /// calculation for the instructions in a loop.
  bool canMacroFuseCmp() const;

  /// Return true if the target can save a compare for loop count, for example
  /// hardware loop saves a compare.
  bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
                  DominatorTree *DT, AssumptionCache *AC,
                  TargetLibraryInfo *LibInfo) const;

  /// \return True if LSR should make efforts to create/preserve post-inc
  /// addressing mode expressions.
  bool shouldFavorPostInc() const;

  /// Return true if LSR should make efforts to generate indexed addressing
  /// modes that operate across loop iterations.
  bool shouldFavorBackedgeIndex(const Loop *L) const;

  /// Return true if the target supports masked store.
  bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
  /// Return true if the target supports masked load.
  bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;

  /// Return true if the target supports nontemporal store.
  bool isLegalNTStore(Type *DataType, Align Alignment) const;
  /// Return true if the target supports nontemporal load.
  bool isLegalNTLoad(Type *DataType, Align Alignment) const;

  /// Return true if the target supports masked scatter.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
  /// Return true if the target supports masked gather.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) const;

  /// Return true if the target supports masked compress store.
  bool isLegalMaskedCompressStore(Type *DataType) const;
  /// Return true if the target supports masked expand load.
  bool isLegalMaskedExpandLoad(Type *DataType) const;

  /// Return true if the target has a unified operation to calculate division
  /// and remainder. If so, the additional implicit multiplication and
  /// subtraction required to calculate a remainder from division are free. This
  /// can enable more aggressive transformations for division and remainder than
  /// would typically be allowed using throughput or size cost models.
  bool hasDivRemOp(Type *DataType, bool IsSigned) const;

  /// Return true if the given instruction (assumed to be a memory access
  /// instruction) has a volatile variant. If that's the case then we can avoid
  /// addrspacecast to generic AS for volatile loads/stores. Default
  /// implementation returns false, which prevents address space inference for
  /// volatile loads/stores.
  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;

  /// Return true if target doesn't mind addresses in vectors.
  bool prefersVectorizedAddressing() const;

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns a negative value.
  /// TODO: Handle pre/postinc as well.
  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale,
                           unsigned AddrSpace = 0) const;

  /// Return true if the loop strength reduce pass should make
  /// Instruction* based TTI queries to isLegalAddressingMode(). This is
  /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
  /// immediate offset and no index register.
  bool LSRWithInstrQueries() const;

  /// Return true if it's free to truncate a value of type Ty1 to type
  /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16
  /// by referencing its sub-register AX.
  bool isTruncateFree(Type *Ty1, Type *Ty2) const;

  /// Return true if it is profitable to hoist instruction in the
  /// then/else to before if.
  bool isProfitableToHoist(Instruction *I) const;

  bool useAA() const;

  /// Return true if this type is legal.
  bool isTypeLegal(Type *Ty) const;

  /// Return true if switches should be turned into lookup tables for the
  /// target.
  bool shouldBuildLookupTables() const;

  /// Return true if switches should be turned into lookup tables
  /// containing this constant value for the target.
  bool shouldBuildLookupTablesForConstant(Constant *C) const;

  /// Return true if the input function, which is cold at all call sites,
  /// should use the coldcc calling convention.

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
                                    bool Insert, bool Extract) const;
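
  // Illustrative sketch (not part of the upstream header): estimating the cost
  // of inserting and extracting every element of an assumed vector type VecTy
  // with NumElts elements.
  //
  //   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
  //   unsigned Overhead = TTI.getScalarizationOverhead(VecTy, DemandedElts,
  //                                                    /*Insert=*/true,
  //                                                    /*Extract=*/true);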

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied by VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) const;

  /// If target has efficient vector element load/store instructions, it can
  /// return true here so that insertion/extraction costs are not added to
  /// the scalarization cost of a load/store.
  bool supportsEfficientVectorElementLoadStore() const;

  /// Don't restrict interleaved unrolling to small loops.
  bool enableAggressiveInterleaving(bool LoopHasReductions) const;

  /// Returns options for expansion of memcmp. IsZeroCmp is
  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
  struct MemCmpExpansionOptions {
    // Return true if memcmp expansion is enabled.
    operator bool() const { return MaxNumLoads > 0; }

    // Maximum number of load operations.
    unsigned MaxNumLoads = 0;

    // The list of available load sizes (in bytes), sorted in decreasing order.
    SmallVector<unsigned, 8> LoadSizes;

    // For memcmp expansion when the memcmp result is only compared equal or
    // not-equal to 0, allow up to this number of load pairs per block. As an
    // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
    //   a0 = load2bytes &a[0]
    //   b0 = load2bytes &b[0]
    //   a2 = load1byte  &a[2]
    //   b2 = load1byte  &b[2]
    //   r  = cmp eq (a0 ^ b0 | a2 ^ b2), 0
    unsigned NumLoadsPerBlock = 1;

    // Set to true to allow overlapping loads. For example, 7-byte compares can
    // be done with two 4-byte compares instead of 4+2+1-byte compares. This
    // requires all loads in LoadSizes to be doable in an unaligned way.
    bool AllowOverlappingLoads = false;
  };
  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                               bool IsZeroCmp) const;
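
  // Illustrative sketch (not part of the upstream header): a memcmp expansion
  // client queries the options and bails out when expansion is disabled.
  //
  //   auto Options = TTI.enableMemCmpExpansion(/*OptSize=*/false,
  //                                            /*IsZeroCmp=*/true);
  //   if (!Options)
  //     return false; // Target did not enable memcmp expansion.
  //   // Otherwise consult Options.MaxNumLoads, Options.LoadSizes, etc.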

  /// Enable matching of interleaved access groups.
  bool enableInterleavedAccessVectorization() const;

  /// Enable matching of interleaved access groups that contain predicated
  /// accesses or gaps and therefore vectorized using masked
  /// vector loads/stores.
  bool enableMaskedInterleavedAccessVectorization() const;

  /// Indicate that it is potentially unsafe to automatically vectorize
  /// floating-point operations because the semantics of vector and scalar
  /// floating-point arithmetic may differ. For example, ARM NEON v7 SIMD math
  /// does not support IEEE-754 denormal numbers, while depending on the
  /// platform, scalar floating-point math does.
  /// This applies to floating-point math operations and calls, not memory
  /// operations, shuffles, or casts.
  bool isFPVectorizationPotentiallyUnsafe() const;

  /// Determine if the target supports unaligned memory accesses.
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace = 0,
                                      unsigned Alignment = 1,
                                      bool *Fast = nullptr) const;
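
  // Illustrative sketch (not part of the upstream header): checking whether an
  // unaligned 128-bit access in address space 0 is both allowed and fast (Ctx
  // is an assumed LLVMContext reference).
  //
  //   bool Fast = false;
  //   if (TTI.allowsMisalignedMemoryAccesses(Ctx, /*BitWidth=*/128,
  //                                          /*AddressSpace=*/0,
  //                                          /*Alignment=*/1, &Fast) &&
  //       Fast)
  //     ... emit the unaligned access ...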

  /// Return hardware support for population count.
  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;

  /// Return true if the hardware has a fast square-root instruction.
  bool haveFastSqrt(Type *Ty) const;

  /// Return true if it is faster to check if a floating-point value is NaN
  /// (or not-NaN) versus a comparison against a constant FP zero value.
  /// Targets should override this if materializing a 0.0 for comparison is
  /// generally as cheap as checking for ordered/unordered.
  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;

  /// Return the expected cost of supporting the floating point operation
  /// of the specified type.
  int getFPOpCost(Type *Ty) const;

  /// Return the expected cost of materializing for the given integer
  /// immediate of the specified type.
  int getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const;

  /// Return the expected cost of materialization for the given integer
  /// immediate of the specified type for a given instruction. The cost can be
  /// zero if the immediate can be folded into the specified instruction.
  int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm,
                        Type *Ty, TargetCostKind CostKind) const;
  int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty, TargetCostKind CostKind) const;
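
  // Illustrative sketch (not part of the upstream header): constant-hoisting
  // style clients ask how expensive an immediate Imm is as the second operand
  // of an add of type Int64Ty (both names assumed).
  //
  //   int Cost = TTI.getIntImmCostInst(Instruction::Add, /*Idx=*/1, Imm,
  //                                    Int64Ty,
  //                                    TargetTransformInfo::TCK_SizeAndLatency);
  //   bool WorthHoisting = Cost > TargetTransformInfo::TCC_Basic;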

  /// Return the expected cost for the given integer when optimising
  /// for size. This is different than the other integer immediate cost
  /// functions in that it is subtarget agnostic. This is useful when you e.g.
  /// target one ISA such as AArch32 but smaller encodings could be possible
  /// with another such as Thumb. This return value is used as a penalty when
  /// the total costs for a constant is calculated (the bigger the cost, the
  /// more beneficial constant hoisting is).
  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
                            Type *Ty) const;
  /// @}

  /// \name Vector Target Information
  /// @{

  /// The various kinds of shuffle patterns for vector queries.
  enum ShuffleKind {
    SK_Broadcast,        ///< Broadcast element 0 to all other elements.
    SK_Reverse,          ///< Reverse the order of the vector.
    SK_Select,           ///< Selects elements from the corresponding lane of
                         ///< either source operand. This is equivalent to a
                         ///< vector select with a constant condition operand.
    SK_Transpose,        ///< Transpose two vectors.
    SK_InsertSubvector,  ///< InsertSubvector. Index indicates start offset.
    SK_ExtractSubvector, ///< ExtractSubvector Index indicates start offset.
    SK_PermuteTwoSrc,    ///< Merge elements from two source vectors into one
                         ///< with any shuffle mask.
    SK_PermuteSingleSrc  ///< Shuffle elements of single source vector with any
                         ///< shuffle mask.
  };

  /// Kind of the reduction data.
  enum ReductionKind {
    RK_None,           ///< Not a reduction.
    RK_Arithmetic,     ///< Binary reduction data.
    RK_MinMax,         ///< Min/max reduction data.
    RK_UnsignedMinMax, ///< Unsigned min/max reduction data.
  };

  /// Contains opcode + LHS/RHS parts of the reduction operations.
  struct ReductionData {
    ReductionData() = delete;
    ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS)
        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) {
      assert(Kind != RK_None && "expected binary or min/max reduction only.");
    }
    unsigned Opcode = 0;
    Value *LHS = nullptr;
    Value *RHS = nullptr;
    ReductionKind Kind = RK_None;
    bool hasSameData(ReductionData &RD) const {
      return Kind == RD.Kind && Opcode == RD.Opcode;
    }
  };

  static ReductionKind matchPairwiseReduction(
    const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);

  static ReductionKind matchVectorSplittingReduction(
    const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);

  /// Additional information about an operand's possible values.
  enum OperandValueKind {
    OK_AnyValue,               // Operand can have any value.
    OK_UniformValue,           // Operand is uniform (splat of a value).
    OK_UniformConstantValue,   // Operand is uniform constant.
    OK_NonUniformConstantValue // Operand is a non uniform constant value.
  };

  /// Additional properties of an operand's values.
  enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };

  /// \return the number of registers in the target-provided register class.
  unsigned getNumberOfRegisters(unsigned ClassID) const;

  /// \return the target-provided register class ID for the provided type,
  /// accounting for type promotion and other type-legalization techniques that
  /// the target might apply. However, it specifically does not account for the
  /// scalarization or splitting of vector types. Should a vector type require
  /// scalarization or splitting into multiple underlying vector registers, that
  /// type should be mapped to a register class containing no registers.
  /// Specifically, this is designed to provide a simple, high-level view of the
  /// register allocation later performed by the backend. These register classes
  /// don't necessarily map onto the register classes used by the backend.
  /// FIXME: It's not currently possible to determine how many registers
  /// are used by the provided type.
  unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;

  /// \return the target-provided register class name
  const char *getRegisterClassName(unsigned ClassID) const;

  /// \return The width of the largest scalar or vector register type.
  unsigned getRegisterBitWidth(bool Vector) const;

  /// \return The width of the smallest vector register type.
  unsigned getMinVectorRegisterBitWidth() const;

  /// \return True if the vectorization factor should be chosen to
  /// make the vector of the smallest element type match the size of a
  /// vector register. For wider element types, this could result in
  /// creating vectors that span multiple vector registers.
  /// If false, the vectorization factor will be chosen based on the
  /// size of the widest element type.
  bool shouldMaximizeVectorBandwidth(bool OptSize) const;

  /// \return The minimum vectorization factor for types of given element
  /// bit width, or 0 if there is no minimum VF. The returned value only
  /// applies when shouldMaximizeVectorBandwidth returns true.
  unsigned getMinimumVF(unsigned ElemWidth) const;

  /// \return True if it should be considered for address type promotion.
  /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
  /// profitable without finding other extensions fed by the same input.
  bool shouldConsiderAddressTypePromotion(
      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;

  /// \return The size of a cache line in bytes.
  unsigned getCacheLineSize() const;

  /// The possible cache levels
  enum class CacheLevel {
    L1D, // The L1 data cache
    L2D, // The L2 data cache

    // We currently do not model L3 caches, as their sizes differ widely between
    // microarchitectures. Also, we currently do not have a use for L3 cache
    // size modeling yet.
  };

  /// \return The size of the cache level in bytes, if available.
  Optional<unsigned> getCacheSize(CacheLevel Level) const;

  /// \return The associativity of the cache level, if available.
  Optional<unsigned> getCacheAssociativity(CacheLevel Level) const;
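
  // Illustrative sketch (not part of the upstream header): both cache queries
  // return llvm::Optional, so absent information has to be handled explicitly
  // (WorkingSetLimit is an assumed local).
  //
  //   if (Optional<unsigned> L1Size =
  //           TTI.getCacheSize(TargetTransformInfo::CacheLevel::L1D))
  //     WorkingSetLimit = *L1Size;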

  /// \return How much before a load we should place the prefetch
  /// instruction.  This is currently measured in number of
  /// instructions.
  unsigned getPrefetchDistance() const;

  /// Some HW prefetchers can handle accesses up to a certain constant stride.
  /// Sometimes prefetching is beneficial even below the HW prefetcher limit,
  /// and the arguments provided are meant to serve as a basis for deciding this
  /// for a particular loop.
  ///
  /// \param NumMemAccesses        Number of memory accesses in the loop.
  /// \param NumStridedMemAccesses Number of the memory accesses that
  ///                              ScalarEvolution could find a known stride
  ///                              for.
  /// \param NumPrefetches         Number of software prefetches that will be
  ///                              emitted as determined by the addresses
  ///                              involved and the cache line size.
  /// \param HasCall               True if the loop contains a call.
  ///
  /// \return This is the minimum stride in bytes where it makes sense to start
  ///         adding SW prefetches. The default is 1, i.e. prefetch with any
  ///         stride.
  unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                unsigned NumStridedMemAccesses,
                                unsigned NumPrefetches, bool HasCall) const;

  /// \return The maximum number of iterations to prefetch ahead.  If
  /// the required number of iterations is more than this number, no
  /// prefetching is performed.
  unsigned getMaxPrefetchIterationsAhead() const;

  /// \return True if prefetching should also be done for writes.
  bool enableWritePrefetching() const;

  /// \return The maximum interleave factor that any transform should try to
  /// perform for this target. This number depends on the level of parallelism
  /// and the number of execution units in the CPU.
  unsigned getMaxInterleaveFactor(unsigned VF) const;

  /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
  static OperandValueKind getOperandInfo(Value *V,
                                         OperandValueProperties &OpProps);

  /// This is an approximation of reciprocal throughput of a math/logic op.
  /// A higher cost indicates less expected throughput.
  /// From Agner Fog's guides, reciprocal throughput is "the average number of
  /// clock cycles per instruction when the instructions are not part of a
  /// limiting dependency chain."
  /// Therefore, costs should be scaled to account for multiple execution units
  /// on the target that can process this type of instruction. For example, if
  /// there are 5 scalar integer units and 2 vector integer units that can
  /// calculate an 'add' in a single cycle, this model should indicate that the
  /// cost of the vector add instruction is 2.5 times the cost of the scalar
  /// add instruction.
  /// \p Args is an optional argument which holds the instruction operands
  /// values so the TTI can analyze those values searching for special
  /// cases or optimizations based on those values.
  /// \p CxtI is the optional original context instruction, if one exists, to
  /// provide even more information.
  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      OperandValueKind Opd1Info = OK_AnyValue,
      OperandValueKind Opd2Info = OK_AnyValue,
      OperandValueProperties Opd1PropInfo = OP_None,
      OperandValueProperties Opd2PropInfo = OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) const;
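
  // Illustrative sketch (not part of the upstream header): comparing the
  // default (reciprocal-throughput) cost of a scalar add against a vector add
  // over assumed types ScalarTy and VecTy.
  //
  //   int ScalarAdd = TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy);
  //   int VectorAdd = TTI.getArithmeticInstrCost(Instruction::Add, VecTy);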

  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
  /// The index and subtype parameters are used by the subvector insertion and
  /// extraction shuffle kinds to show the insert/extract point and the type of
  /// the subvector being inserted/extracted.
  /// NOTE: For subvector extractions Tp represents the source type.
  int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index = 0,
                     VectorType *SubTp = nullptr) const;
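
  // Illustrative sketch (not part of the upstream header): the cost of
  // splatting element 0 of an assumed vector type VecTy across all lanes.
  //
  //   int SplatCost =
  //       TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);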

  /// \return The expected cost of cast instructions, such as bitcast, trunc,
  /// zext, etc. If there is an existing instruction that holds Opcode, it
  /// may be passed in the 'I' parameter.
  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
                       const Instruction *I = nullptr) const;

  /// \return The expected cost of a sign- or zero-extended vector extract. Use
  /// -1 to indicate that there is no information about the index value.
  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                               unsigned Index = -1) const;
