Commit 090cae84 authored by Simon Pilgrim's avatar Simon Pilgrim
Browse files

[TTI] Add DemandedElts to getScalarizationOverhead

The improvements to the x86 vector insert/extract element costs in D74976 resulted in the estimated costs for vector initialization and scalarization increasing higher than should be expected. This is particularly noticeable on pre-SSE4 targets where the available of legal INSERT_VECTOR_ELT ops is more limited.

This patch does 2 things:
1 - it implements X86TTIImpl::getScalarizationOverhead to more accurately represent the typical costs of a ISD::BUILD_VECTOR pattern.
2 - it adds a DemandedElts mask to getScalarizationOverhead to permit the SLP's BoUpSLP::getGatherCost to be rewritten to use it directly instead of accumulating raw vector insertion costs.

This fixes PR45418 where a v4i8 (zext'd to v4i32) was no longer vectorizing.

A future patch should extend X86TTIImpl::getScalarizationOverhead to tweak the EXTRACT_VECTOR_ELT scalarization costs as well.

Reviewed By: @craig.topper

Differential Revision: https://reviews.llvm.org/D78216
parent 42a56bf6
......@@ -611,8 +611,15 @@ public:
/// should use coldcc calling convention.
bool useColdCCForColdCall(Function &F) const;
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) const;
/// Estimate the overhead of scalarizing an instructions unique
/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied with VF.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) const;
......@@ -1231,8 +1238,8 @@ public:
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool useColdCCForColdCall(Function &F) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) = 0;
virtual unsigned
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) = 0;
......@@ -1556,9 +1563,9 @@ public:
return Impl.useColdCCForColdCall(F);
}
unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) override {
return Impl.getScalarizationOverhead(Ty, Insert, Extract);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) override {
......
......@@ -239,7 +239,8 @@ public:
bool useColdCCForColdCall(Function &F) { return false; }
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) {
return 0;
}
......
......@@ -548,12 +548,19 @@ public:
unsigned getRegisterBitWidth(bool Vector) const { return 32; }
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);
assert(DemandedElts.getBitWidth() == VTy->getNumElements() &&
"Vector size mismatch");
unsigned Cost = 0;
for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
if (!DemandedElts[i])
continue;
if (Insert)
Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::InsertElement, VTy, i);
......@@ -565,6 +572,14 @@ public:
return Cost;
}
/// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);
APInt DemandedElts = APInt::getAllOnesValue(VTy->getNumElements());
return static_cast<T *>(this)->getScalarizationOverhead(Ty, DemandedElts,
Insert, Extract);
}
/// Estimate the overhead of scalarizing an instructions unique
/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied with VF.
......
......@@ -368,9 +368,9 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
return TTIImpl->useColdCCForColdCall(F);
}
unsigned TargetTransformInfo::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
unsigned TargetTransformInfo::getScalarizationOverhead(
Type *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
......
......@@ -115,9 +115,10 @@ unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
return (8 * ST.getVectorLength()) / ElemWidth;
}
unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) {
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) {
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
......
......@@ -101,9 +101,10 @@ public:
return true;
}
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
unsigned VF);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF);
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF,
......
......@@ -2868,9 +2868,62 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) {
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) {
auto* VecTy = cast<VectorType>(Ty);
unsigned Cost = 0;
// For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MScalarTy = LT.second.getScalarType();
if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
(MScalarTy.isInteger() && ST->hasSSE41()) ||
(MScalarTy == MVT::f32 && ST->hasSSE41())) {
// For types we can insert directly, insertion into 128-bit sub vectors is
// cheap, followed by a cheap chain of concatenations.
if (LT.second.getSizeInBits() <= 128) {
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
Cost += DemandedElts.countPopulation();
// For vXf32 cases, insertion into the 0'th index in each v4f32
// 128-bit vector is free.
// NOTE: This assumes legalization widens vXf32 vectors.
if (MScalarTy == MVT::f32)
for (unsigned i = 0, e = VecTy->getNumElements(); i < e; i += 4)
if (DemandedElts[i])
Cost--;
}
} else if (LT.second.isVector()) {
// Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
// integer element as a SCALAR_TO_VECTOR, then we build the vector as a
// series of UNPCK followed by CONCAT_VECTORS - all of these can be
// considered cheap.
if (Ty->isIntOrIntVectorTy())
Cost += DemandedElts.countPopulation();
// Get the smaller of the legalized or original pow2-extended number of
// vector elements, which represents the number of unpacks we'll end up
// performing.
unsigned NumElts = LT.second.getVectorNumElements();
unsigned Pow2Elts = PowerOf2Ceil(VecTy->getNumElements());
Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
}
}
// TODO: Use default extraction for now, but we should investigate extending this
// to handle repeated subvector extraction.
if (Extract)
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
return Cost;
}
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
......@@ -2893,9 +2946,11 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
AddressSpace);
int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
int SplitCost = getScalarizationOverhead(Src, DemandedElts,
Opcode == Instruction::Load,
Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
......@@ -2935,13 +2990,15 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
(IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
int ValueSplitCost =
getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
MaybeAlign(Alignment), AddressSpace);
......@@ -3795,12 +3852,14 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, unsigned Alignment,
unsigned AddressSpace) {
unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
APInt DemandedElts = APInt::getAllOnesValue(VF);
int MaskUnpackCost = 0;
if (VariableMask) {
VectorType *MaskTy =
VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
nullptr);
......
......@@ -132,7 +132,8 @@ public:
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
......
......@@ -5703,8 +5703,9 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost +=
TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
APInt::getAllOnesValue(VF), true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}
......@@ -5720,7 +5721,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Worklist.push_back(J);
else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
ToVectorTy(J->getType(),VF), false, true);
ToVectorTy(J->getType(), VF), APInt::getAllOnesValue(VF), false,
true);
}
// Scale the total scalar cost by block probability.
......@@ -6004,7 +6006,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(RetTy, true, false);
Cost += TTI.getScalarizationOverhead(RetTy, APInt::getAllOnesValue(VF),
true, false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
......@@ -6210,7 +6213,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// Return cost for branches around scalarized and predicated blocks.
Type *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
false, true) +
(TTI.getCFInstrCost(Instruction::Br) * VF));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
// The back-edge branch will remain, as will all scalar branches.
......
......@@ -3877,10 +3877,13 @@ int BoUpSLP::getTreeCost() {
int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
int Cost = 0;
for (unsigned i = 0, e = Ty->getNumElements(); i < e; ++i)
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
DemandedElts.setBit(i);
int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
......@@ -7018,6 +7021,34 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
"Expected insertelement or insertvalue instruction!");
UserCost = 0;
do {
// TODO: Use TTI's getScalarizationOverhead for sequence of inserts rather
// than sum of single inserts as the latter may overestimate cost.
// This work should imply improving cost estimation for extracts that
// added in for external (for vectorization tree) users.
// For example, in following case all extracts added in order to feed
// into external users (inserts), which in turn form sequence to build
// an aggregate that we do match here:
// %4 = extractelement <4 x i64> %3, i32 0
// %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
// %5 = extractelement <4 x i64> %3, i32 1
// %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
// %6 = extractelement <4 x i64> %3, i32 2
// %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
// %7 = extractelement <4 x i64> %3, i32 3
// %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
//
// Cost of this entire sequence is currently estimated as sum of single
// extracts (as this aggregate build sequence is an external to
// vectorization tree user) minus cost of the aggregate build.
// As this whole sequence will be optimized away we want the cost to be
// zero. But it is not quite possible using given approach (at least for
// X86) because inserts can be more expensive than extracts for longer
// vector lengths so the difference turns out not zero in such a case.
// Ideally we want to match this entire sequence and treat it as a no-op
// (i.e. do not count into final cost at all).
// Currently the difference tends to be negative thus adding a bias
// toward favoring vectorization. If we switch into using TTI interface
// the bias tendency will remain but will be lower.
Value *InsertedOperand;
if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
InsertedOperand = IE->getOperand(1);
......
......@@ -673,9 +673,9 @@ define i32 @fdiv(i32 %arg) {
define i32 @frem(i32 %arg) {
; SSE1-LABEL: 'frem'
; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef
......@@ -684,9 +684,9 @@ define i32 @frem(i32 %arg) {
;
; SSE2-LABEL: 'frem'
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
......@@ -707,23 +707,23 @@ define i32 @frem(i32 %arg) {
; AVX-LABEL: 'frem'
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16F32 = frem <16 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F32 = frem <8 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16F32 = frem <16 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'frem'
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F32 = frem <8 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16F32 = frem <16 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = frem <8 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = frem <8 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'frem'
......@@ -1059,9 +1059,9 @@ define i32 @fcopysign(i32 %arg) {
define i32 @fma(i32 %arg) {
; SSE1-LABEL: 'fma'
; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
......@@ -1070,9 +1070,9 @@ define i32 @fma(i32 %arg) {
;
; SSE2-LABEL: 'fma'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
......@@ -1093,12 +1093,12 @@ define i32 @fma(i32 %arg) {
; AVX-LABEL: 'fma'
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'fma'
......
......@@ -13,9 +13,9 @@
define i32 @fptosi_double_i64(i32 %arg) {
; SSE2-LABEL: 'fptosi_double_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptosi_double_i64'
......@@ -28,15 +28,15 @@ define i32 @fptosi_double_i64(i32 %arg) {
; AVX-LABEL: 'fptosi_double_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'fptosi_double_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'fptosi_double_i64'
......@@ -164,10 +164,10 @@ define i32 @fptosi_double_i8(i32 %arg) {
define i32 @fptosi_float_i64(i32 %arg) {
; SSE2-LABEL: 'fptosi_float_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptosi_float_i64'
......@@ -181,17 +181,17 @@ define i32 @fptosi_float_i64(i32 %arg) {
; AVX-LABEL: 'fptosi_float_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'fptosi_float_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'fptosi_float_i64'
......
......@@ -13,9 +13,9 @@
define i32 @fptoui_double_i64(i32 %arg) {
; SSE2-LABEL: 'fptoui_double_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptoui_double_i64'
......@@ -28,15 +28,15 @@ define i32 @fptoui_double_i64(i32 %arg) {
; AVX-LABEL: 'fptoui_double_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'fptoui_double_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'fptoui_double_i64'
......@@ -63,9 +63,9 @@ define i32 @fptoui_double_i64(i32 %arg) {
define i32 @fptoui_double_i32(i32 %arg) {
; SSE2-LABEL: 'fptoui_double_i32'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptoui_double_i32'
......@@ -178,10 +178,10 @@ define i32 @fptoui_double_i8(i32 %arg) {
define i32 @fptoui_float_i64(i32 %arg) {
; SSE2-LABEL: 'fptoui_float_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>