[AMDGPU] Improve code size cost model (part 2) (47a5c36b) · Commits · llvm-doe / llvm-project

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

+98 −18

Original line number	Diff line number	Diff line
		@@ -695,26 +695,27 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

		// Returns a cost estimate for the user U by dispatching on its instruction
		// opcode to the matching cost hook (vector element access, intrinsic call,
		// shuffle, cast, arithmetic). Users that are not instructions, and opcodes
		// without a dedicated case, are forwarded to BaseT::getUserCost.
		//
		// NOTE(review): this listing is a rendered diff without +/- markers, so
		// pre-change and post-change lines appear interleaved. Lines that go
		// through the EE / IE variables look like the removed side, and the lines
		// that go through I look like the added side -- confirm against the
		// repository before treating this text as compilable code.
		unsigned GCNTTIImpl::getUserCost(const User *U,
		ArrayRef<const Value *> Operands) {
		// Estimate extractelement elimination
		// NOTE(review): presumably removed-side lines (dyn_cast chain on U).
		if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) {
		ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1));
		// Anything that is not an Instruction is handed to the base implementation.
		const Instruction *I = dyn_cast<Instruction>(U);
		if (!I)
		return BaseT::getUserCost(U, Operands);

		// Estimate different operations to be optimized out
		switch (I->getOpcode()) {
		case Instruction::ExtractElement: {
		// Operand 1 is used as the element index; Idx stays at (unsigned)-1 when
		// that operand is not a ConstantInt.
		ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
		unsigned Idx = -1;
		if (CI)
		Idx = CI->getZExtValue();
		// NOTE(review): the EE-based return below is presumably the removed side;
		// the I-based return after it is its replacement.
		return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(),
		Idx);
		return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
		}

		// Estimate insertelement elimination
		// NOTE(review): presumably removed-side lines (dyn_cast chain on U).
		if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) {
		ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
		case Instruction::InsertElement: {
		// Operand 2 is used as the element index; Idx stays at (unsigned)-1 when
		// that operand is not a ConstantInt.
		ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
		unsigned Idx = -1;
		if (CI)
		Idx = CI->getZExtValue();
		// NOTE(review): the IE-based return below is presumably the removed side;
		// the I-based return after it is its replacement.
		return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx);
		return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
		}

		// Estimate different intrinsics, e.g. llvm.fabs
		case Instruction::Call: {
		// Only intrinsic calls get a dedicated estimate; every other call goes to
		// the base implementation in the else branch.
		if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
		SmallVector<Value *, 4> Args(II->arg_operands());
		FastMathFlags FMF;
		// Diff hunk boundary: the lines between the two hunks (including the
		// declaration of FPMO used below) are not shown in this view.
		@@ -722,7 +723,86 @@ unsigned GCNTTIImpl::getUserCost(const User *U,
		FMF = FPMO->getFastMathFlags();
		return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
		FMF);
		} else {
		return BaseT::getUserCost(U, Operands);
		}
		}
		case Instruction::ShuffleVector: {
		// Classify the shuffle mask from the most specific pattern to the most
		// general one and ask getShuffleCost for the matching shuffle kind.
		const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
		Type *Ty = Shuffle->getType();
		Type *SrcTy = Shuffle->getOperand(0)->getType();

		// TODO: Identify and add costs for insert subvector, etc.
		int SubIndex;
		if (Shuffle->isExtractSubvectorMask(SubIndex))
		return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);

		// NOTE(review): the declared return type is unsigned, so -1 is converted
		// to the maximum unsigned value here -- confirm that callers expect this
		// sentinel rather than a fallback to BaseT::getUserCost.
		if (Shuffle->changesLength())
		return -1;

		// An identity shuffle is reported as free.
		if (Shuffle->isIdentity())
		return 0;

		if (Shuffle->isReverse())
		return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);

		if (Shuffle->isSelect())
		return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);

		if (Shuffle->isTranspose())
		return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);

		if (Shuffle->isZeroEltSplat())
		return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);

		if (Shuffle->isSingleSource())
		return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);

		// No narrower pattern matched: cost it as a two-source permute.
		return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
		}
		// All cast opcodes share one path: destination type is the instruction's
		// type, source type is the type of operand 0.
		case Instruction::ZExt:
		case Instruction::SExt:
		case Instruction::FPToUI:
		case Instruction::FPToSI:
		case Instruction::FPExt:
		case Instruction::PtrToInt:
		case Instruction::IntToPtr:
		case Instruction::SIToFP:
		case Instruction::UIToFP:
		case Instruction::Trunc:
		case Instruction::FPTrunc:
		case Instruction::BitCast:
		case Instruction::AddrSpaceCast: {
		return getCastInstrCost(I->getOpcode(), I->getType(),
		I->getOperand(0)->getType(), I);
		}
		// All arithmetic/logic opcodes share one path; operand kinds and
		// properties are passed as the generic OK_AnyValue / OP_None values.
		case Instruction::Add:
		case Instruction::FAdd:
		case Instruction::Sub:
		case Instruction::FSub:
		case Instruction::Mul:
		case Instruction::FMul:
		case Instruction::UDiv:
		case Instruction::SDiv:
		case Instruction::FDiv:
		case Instruction::URem:
		case Instruction::SRem:
		case Instruction::FRem:
		case Instruction::Shl:
		case Instruction::LShr:
		case Instruction::AShr:
		case Instruction::And:
		case Instruction::Or:
		case Instruction::Xor:
		case Instruction::FNeg: {
		return getArithmeticInstrCost(I->getOpcode(), I->getType(),
		TTI::OK_AnyValue, TTI::OK_AnyValue,
		TTI::OP_None, TTI::OP_None, Operands);
		}
		default:
		break;
		}

		// Opcodes without a dedicated case fall through to the base implementation.
		return BaseT::getUserCost(U, Operands);
		}

llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll

+3 −0

Original line number	Diff line number	Diff line
		; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
		; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
		; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
		; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s


		; CHECK: 'add_i32'
		; CHECK: estimated cost of 1 for {{.*}} add i32

llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll

+1 −0

Original line number	Diff line number	Diff line
		; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
		; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s

		; CHECK-LABEL: 'addrspacecast_global_to_flat'
		; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8*

llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll

+1 −0

Original line number	Diff line number	Diff line
		; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
		; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s

		; CHECK: 'or_i32'
		; CHECK: estimated cost of 1 for {{.*}} or i32

llvm/test/Analysis/CostModel/AMDGPU/fadd.ll

+2 −0

Original line number	Diff line number	Diff line
		; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
		; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
		; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
		; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s

		; ALL: 'fadd_f32'
		; ALL: estimated cost of 1 for {{.*}} fadd float