[CostModel][X86] Add generic costs for vXi32 MUL -> v2Xi16 PMADDWD folds (f114ef37) · Commits · llvm-doe / llvm-project

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

+17 −0

Original line number	Diff line number	Diff line
		@@ -206,6 +206,22 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
		int ISD = TLI->InstructionOpcodeToISD(Opcode);
		assert(ISD && "Invalid opcode");

		if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
		LT.second.getScalarType() == MVT::i32) {
		// Check if the operands can be represented as a smaller datatype.
		bool Op1Signed = false, Op2Signed = false;
		unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
		unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
		unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

		// If both are representable as i15 and at least one is zero-extended,
		// then we can treat this as PMADDWD which has the same costs
		// as a vXi16 multiply.
		if (OpMinSize <= 15 && (!Op1Signed || !Op2Signed) && !ST->isPMADDWDSlow())
		LT.second =
		MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
		}

		if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
		ISD == ISD::UREM) &&
		(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
		@@ -288,6 +304,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
		if (ST->isSLM()) {
		if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
		// Check if the operands can be shrunk into a smaller datatype.
		// TODO: Merge this into generic vXi32 MUL patterns above.
		bool Op1Signed = false;
		unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
		bool Op2Signed = false;

+70 −70

File changed.

Preview size limit exceeded, changes collapsed.

+6 −6

File changed.

Preview size limit exceeded, changes collapsed.

+5 −5

Original line number	Diff line number	Diff line
		@@ -36,13 +36,13 @@ for.body: ; preds = %for.body.preheader,
		%mul = mul nsw i32 %conv3, %conv
		; sources of the mul is zext\sext from i8
		; use pmulhw\pmullw\pshuf
		; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
		; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32
		%conv4 = zext i8 %1 to i32
		%mul2 = mul nsw i32 %conv4, %conv
		%sum0 = add i32 %mul, %mul2
		; sources of the mul is zext\zext from i8
		; use pmullw\zext
		; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
		; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32
		%conv5 = zext i8 %0 to i32
		%mul3 = mul nsw i32 %conv5, %conv4
		%sum1 = add i32 %sum0, %mul3
		@@ -53,17 +53,17 @@ for.body: ; preds = %for.body.preheader,
		%sum2 = add i32 %sum1, %mul4
		; sources of the mul is sext\250
		; use pmulhw\pmullw\pshuf
		; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
		; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32
		%mul5 = mul nsw i32 250, %conv3
		%sum3 = add i32 %sum2, %mul5
		; sources of the mul is zext\-120
		; use pmulhw\pmullw\pshuf
		; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
		; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32
		%mul6 = mul nsw i32 -120, %conv4
		%sum4 = add i32 %sum3, %mul6
		; sources of the mul is zext\250
		; use pmullw\zext
		; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
		; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32
		%mul7 = mul nsw i32 250, %conv4
		%sum5 = add i32 %sum4, %mul7
		%add = add i32 %acc.013, 5