//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

extern cl::opt<bool> DisableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

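// Callees can be inlined only if their subtarget features are compatible with
// the caller's: features outside InlineFeatureWhitelist must match exactly,
// and the callee's whitelisted features must be a subset of the caller's.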
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the whitelist must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
                    (CalleeBits & ~InlineFeatureWhitelist);
  // For features in the whitelist, the callee's features must be a subset of
  // the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
                     (CalleeBits & InlineFeatureWhitelist);
  return MatchExact && MatchSubset;
}

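// Favour backedge-indexed addressing only for single-block loops on Thumb2
// M-class cores, and not when MVE is available or the function is optimised
// for size.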
bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
  if (L->getHeader()->getParent()->hasOptSize())
    return false;
  if (ST->hasMVEIntegerOps())
    return false;
  return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
}

bool ARMTTIImpl::shouldFavorPostInc() const {
  if (ST->hasMVEIntegerOps())
    return true;
  return false;
}

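// Cost of materialising an integer immediate: 1 if it fits a single
// instruction encoding, otherwise 2-3 depending on whether a movw/movt pair
// or a constant-pool load is needed.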
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions, so we return a zero cost for them and 1 otherwise.
int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

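// Cost of an immediate when used by a particular instruction. Immediates that
// can be folded away (division by a constant, UXTB/UXTH masks, BIC/SUB/CMN
// conversions, xor with -1) are free; otherwise fall back to the plain
// materialisation cost.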
int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
                                  Type *Ty, TTI::TargetCostKind CostKind) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  return getIntImmCost(Imm, Ty, CostKind);
}

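// Cast costs are largely table-driven: extensions of loads are free, and
// NEON/MVE vector conversions and scalar int<->fp conversions have dedicated
// tables. Anything not covered falls back to the base implementation, scaled
// by the MVE cost factor for MVE vector types.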
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 TTI::TargetCostKind CostKind,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](int Cost) {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };

  // Single to/from double precision conversions.
  static const CostTblEntry NEONFltDblTbl[] = {
    // Vector fptrunc/fpext conversions.
    { ISD::FP_ROUND,   MVT::v2f64, 2 },
    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
  };

  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
                                          ISD == ISD::FP_EXTEND)) {
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));

  // The extend of a load is free
  if (I && isa<LoadInst>(I->getOperand(0))) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return AdjustCost(Entry->Cost);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return AdjustCost(
    BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}

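// Lane insert/extract costs. These are penalised on NEON to discourage
// cross-class (GPR<->VFP) copies, and on MVE to discourage mixing scalar and
// vector code.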
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should be therefore penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // We say MVE moves costs at least the MVEVectorCostFactor, even though
    // they are scalar instructions. This helps prevent mixing scalar and
    // vector, to prevent vectorising where we end up just scalarising the
    // result anyway.
    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
                    ST->getMVEVectorCostFactor()) *
           cast<FixedVectorType>(ValTy)->getNumElements() / 2;
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

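// Compare/select costs. NEON vector selects lower to VBSL; a few wide i64
// select cases are taken from a table. MVE vector operations are scaled by
// the MVE cost factor.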
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind,
                                              I);
}

int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                          const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

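// MVE masked loads/stores are legal for 8/16/32-bit elements with sufficient
// natural alignment; two-element vectors and widening floating-point vectors
// are not yet supported.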
bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && (!Alignment || *Alignment >= 4)) ||
         (EltWidth == 16 && (!Alignment || *Alignment >= 2)) ||
         (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  // This method is called in 2 places:
  //  - from the vectorizer with a scalar type, in which case we need to get
  //  this as good as we can with the limited info we have (and rely on the cost
  //  model for the rest).
  //  - from the masked intrinsic lowering pass with the actual vector type.
  // For MVE, we have a custom lowering pass that will already have custom
  // legalised any gathers that we can to MVE intrinsics, and want to expand all
  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
  // are here, we know we want to expand.
  if (isa<VectorType>(Ty))
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && (!Alignment || *Alignment >= 4)) ||
          (EltWidth == 16 && (!Alignment || *Alignment >= 2)) || EltWidth == 8);
}

int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
  assert(MI && "MemcpyInst expected");
  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  const unsigned LibCallCost = 4;

  // If 'size' is not a constant, a library call will be generated.
  if (!C)
    return LibCallCost;

  const unsigned Size = C->getValue().getZExtValue();
  const Align DstAlign = *MI->getDestAlign();
  const Align SrcAlign = *MI->getSourceAlign();
  const Function *F = I->getParent()->getParent();
  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
  std::vector<EVT> MemOps;

  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ true),
          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
          F->getAttributes()))
    return MemOps.size() * 2;

  // If we can't find an optimal memop lowering, return the default cost
  return LibCallCost;
}

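// Shuffle costs. Broadcast, reverse and select shuffles are costed from
// per-subtarget tables (VDUP, VREV/VEXT and friends); anything else falls
// back to the base implementation, scaled by the MVE cost factor for MVE
// vector types.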
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                               int Index, VectorType *SubTp) {
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
    }
  }
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

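// Arithmetic costs. NEON vector division/remainder is modelled as library
// calls, shifts that fold into a following ALU instruction are free, and MVE
// vector operations are scaled by the MVE cost factor, with unsupported ops
// scalarised.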
int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Op1Info,
                                       TTI::OperandValueKind Op2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                             Op2Info,
                                             Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;

  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
  // without treating floats as more expensive than scalars or increasing the
  // costs for custom operations. The result is also multiplied by the
  // MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand; assume that we need to scalarize this op.
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
    unsigned Num = VTy->getNumElements();
    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
                                           CostKind);
    // Return the cost of multiple scalar invocations plus the cost of
    // inserting and extracting the values.
    return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
  }

  return BaseCost;
}

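// Load/store costs. NEON accesses of f64 vectors without 16-byte alignment
// are heavily penalised (vld1/vst1 instead of vldr/vstr); otherwise the cost
// is the type legalisation count, scaled by the MVE cost factor for MVE
// vectors.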
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                MaybeAlign Alignment, unsigned AddressSpace,
                                TTI::TargetCostKind CostKind,
                                const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  if (ST->hasNEON() && Src->isVectorTy() &&
      (Alignment && *Alignment != Align(16)) &&
      cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
    return LT.first * 4;
  }
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * LT.first;
}

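// Interleaved access costs. Accesses that can be lowered to vldN/vstN (or, on
// MVE, to a load plus vrev/vmovn for small factor-2 patterns) are cheap;
// everything else uses the base implementation.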
int ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    unsigned Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN don't support vector types with i64/f64 elements.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

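// Gather/scatter costs. MVE gathers/scatters are modelled as serialised
// element accesses and compared against the scalarisation cost, taking into
// account users that extend the loaded value or operands that are truncated
// before being stored.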
unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                            Value *Ptr, bool VariableMask,
                                            unsigned Alignment,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  using namespace PatternMatch;
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
  unsigned VectorCost = NumElems * LT.first;
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  unsigned ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});

  if (Alignment < EltSize / 8)
    return ScalarCost;

  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type
  if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // only allow valid type combinations
        unsigned TypeSize =
            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
             (TypeSize == 16 && EltSize == 8)) &&
            TypeSize * NumElems == 128) {
          ExtSize = TypeSize;
        }
      }
    }
    // Check whether the input data needs to be truncated
    TruncInst *T;
    if ((I->getOpcode() == Instruction::Store ||
         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
      // Only allow valid type combinations
      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();