//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

extern cl::opt<bool> DisableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

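// Check subtarget-feature compatibility before inlining: code generated for
// a mismatched feature set could be illegal on the caller's target. Features
// on the whitelist are the ones considered safe to differ across the call.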
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the whitelist must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
                    (CalleeBits & ~InlineFeatureWhitelist);
  // For features in the whitelist, the callee's features must be a subset of
  // the callers'.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
                     (CalleeBits & InlineFeatureWhitelist);
  return MatchExact && MatchSubset;
}

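// Favour backedge-indexed addressing only for small single-block loops on
// Thumb2 M-class cores; when MVE is available, the post-increment forms
// handled below are preferred instead.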
bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
  if (L->getHeader()->getParent()->hasOptSize())
    return false;
  if (ST->hasMVEIntegerOps())
    return false;
  return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
}

bool ARMTTIImpl::shouldFavorPostInc() const {
  if (ST->hasMVEIntegerOps())
    return true;
  return false;
}

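// Roughly: cost 1 when a single instruction can materialise the immediate
// (mov/movw, or a shifter-operand encoding, possibly complemented), 2 when a
// second instruction such as movt is needed, 3 for a constant-pool load, and
// 4 for immediates with 64 or more active bits.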
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero; otherwise the cost is 1.
int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
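  // For example, a udiv by 10 can be rewritten as a multiply by a
  // precomputed "magic" reciprocal followed by shifts, so the divisor
  // immediate never needs to be materialised.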
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1 << 12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1 << 8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  return getIntImmCost(Imm, Ty, CostKind);
}

int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 TTI::TargetCostKind CostKind,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](int Cost) {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };

  // Single to/from double precision conversions.
  static const CostTblEntry NEONFltDblTbl[] = {
    // Vector fptrunc/fpext conversions.
    { ISD::FP_ROUND,   MVT::v2f64, 2 },
    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
  };

  if (Src->isVectorTy() && ST->hasNEON() &&
      (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) {
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));

  // The extend of a load is free
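  // (e.g. an i8->i32 sext folds into the load itself as an ldrsb; only
  // extends to i64 need an extra instruction for the high half, as the
  // table below reflects.)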
  if (I && isa<LoadInst>(I->getOperand(0))) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return AdjustCost(Entry->Cost);
    }
  }

  // NEON vector operations that can extend their inputs.
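  // For example, a sign-extend from v8i8 to v8i16 feeding an add can be
  // emitted as a single widening vaddl.s8, making the extend itself free.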
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

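  // Otherwise fall back to the base-class estimate, scaled for MVE where
  // vector operations are costed at a multiple of scalar ones via the
  // subtarget's MVEVectorCostFactor.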
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return AdjustCost(
    BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}

int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with an estimated
  // throughput that is three times lower on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross-class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // We say MVE moves cost at least the MVEVectorCostFactor, even though
    // they are scalar instructions. This helps prevent mixing scalar and
    // vector code, so that we do not vectorise where we would end up just
    // scalarising the result anyway.
    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
                    ST->getMVEVectorCostFactor()) *
           cast<FixedVectorType>(ValTy)->getNumElements() / 2;
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
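  // (vbsl is a bitwise select: each result bit is taken from one of the two
  // source operands according to the corresponding mask bit.)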
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind,
                                              I);
}

int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                          const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

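// The VCTP intrinsics create the element-count predicate used by MVE
// tail-predicated loops, so they need to survive LSR unchanged.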
bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
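  // The guaranteed alignment has to cover the element width: 32-bit elements
  // need 4-byte alignment, 16-bit elements 2-byte alignment, and byte-sized
  // elements are always sufficiently aligned.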
  return (EltWidth == 32 && (!Alignment || *Alignment >= 4)) ||
         (EltWidth == 16 && (!Alignment || *Alignment >= 2)) ||
         (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  // This method is called in 2 places:
  //  - from the vectorizer with a scalar type, in which case we need to get
  //  this as good as we can with the limited info we have (and rely on the cost
  //  model for the rest).
  //  - from the masked intrinsic lowering pass with the actual vector type.
  // For MVE, we have a custom lowering pass that will already have custom
  // legalised any gathers that we can to MVE intrinsics, and want to expand all
  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
  // are here, we know we want to expand.
  if (isa<VectorType>(Ty))
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && (!Alignment || *Alignment >= 4)) ||
          (EltWidth == 16 && (!Alignment || *Alignment >= 2)) || EltWidth == 8);
}

int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
  assert(MI && "MemcpyInst expected");
  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  const unsigned LibCallCost = 4;

  // If 'size' is not a constant, a library call will be generated.
  if (!C)
    return LibCallCost;

  const unsigned Size = C->getValue().getZExtValue();
  const Align DstAlign = *MI->getDestAlign();
  const Align SrcAlign = *MI->getSourceAlign();
  const Function *F = I->getParent()->getParent();
  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
  std::vector<EVT> MemOps;

  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
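  // Illustrative example (not from the source): a 16-byte copy between
  // 4-byte-aligned pointers could lower to four i32 load/store pairs, giving
  // MemOps.size() == 4 and a returned cost of 8.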
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ true),
          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
          F->getAttributes()))
    return MemOps.size() * 2;

  // If we can't find an optimal memop lowering, return the default cost.
  return LibCallCost;
}

int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                               int Index, VectorType *SubTp) {
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
    }
  }
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Op1Info,
                                       TTI::OperandValueKind Op2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                             Op2Info,
                                             Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, AND and OR instructions to construct
    // values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on ARM or Thumb2, it might well be folded
  // into the following instruction, hence having a cost of 0.
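  // For example, "add r0, r1, r2, lsl #2" performs the shift as part of the
  // add, so the shl feeding it needs no instruction of its own.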
  auto LooksLikeAFreeShift = [&]() {
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;

  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The result
  // is also multiplied by the MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
    unsigned Num = VTy->getNumElements();
    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
                                           CostKind);
    // Return the cost of multiple scalar invocation plus the cost of
    // inserting and extracting the values.
    return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
  }

  return BaseCost;
}

int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                MaybeAlign Alignment, unsigned AddressSpace,
                                TTI::TargetCostKind CostKind,
                                const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Type legalization can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  if (ST->hasNEON() && Src->isVectorTy() &&
      (Alignment && *Alignment != Align(16)) &&
      cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
    return LT.first * 4;
  }
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * LT.first;
}

int ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    unsigned Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN don't support vector types with i64/f64 elements.
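  // (NEON vld2/vld3/vld4 and vst2/vst3/vst4 only handle 8-, 16- and 32-bit
  // elements.)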
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                            Value *Ptr, bool VariableMask,
                                            unsigned Alignment,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  using namespace PatternMatch;
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
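  // (e.g. a v4i32 gather is costed like four serialised scalar loads.)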
  unsigned VectorCost = NumElems * LT.first;
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  unsigned ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});

  if (Alignment < EltSize / 8)
    return ScalarCost;

  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type
  if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // Only allow valid type combinations.
        unsigned TypeSize =