Commit f114ef37 authored by Simon Pilgrim's avatar Simon Pilgrim
Browse files

[CostModel][X86] Add generic costs for vXi32 MUL -> v2Xi16 PMADDWD folds

Based off the improved fold in D108522

This should eventually allow us to replace the SLM-only cost patterns with generic versions.
parent 9962ebae
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -206,6 +206,22 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is zero-extended,
    // then we can treat this as PMADDWD which has the same costs
    // as a vXi16 multiply.
    if (OpMinSize <= 15 && (!Op1Signed || !Op2Signed) && !ST->isPMADDWDSlow())
      LT.second =
          MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
@@ -288,6 +304,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
+70 −70

File changed.

Preview size limit exceeded, changes collapsed.

+6 −6

File changed.

Preview size limit exceeded, changes collapsed.

+5 −5
Original line number Diff line number Diff line
@@ -36,13 +36,13 @@ for.body: ; preds = %for.body.preheader,
  %mul = mul nsw i32 %conv3, %conv
; sources of the mul is zext\sext from i8
; use pmulhw\pmullw\pshuf
; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32
  %conv4 = zext i8 %1 to i32
  %mul2 = mul nsw i32 %conv4, %conv
  %sum0 = add i32 %mul, %mul2
; sources of the mul is zext\zext from i8
; use pmullw\zext
; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32
  %conv5 = zext i8 %0 to i32
  %mul3 = mul nsw i32 %conv5, %conv4
  %sum1 = add i32 %sum0, %mul3
@@ -53,17 +53,17 @@ for.body: ; preds = %for.body.preheader,
  %sum2 = add i32 %sum1, %mul4
; sources of the mul is sext\250
; use pmulhw\pmullw\pshuf
; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32
  %mul5 = mul nsw i32 250, %conv3
  %sum3 = add i32 %sum2, %mul5
; sources of the mul is zext\-120
; use pmulhw\pmullw\pshuf
; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32
  %mul6 = mul nsw i32 -120, %conv4
  %sum4 = add i32 %sum3, %mul6
; sources of the mul is zext\250
; use pmullw\zext
; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32
  %mul7 = mul nsw i32 250, %conv4
  %sum5 = add i32 %sum4, %mul7
  %add = add i32 %acc.013, 5