Commit f2e7de81 authored by Sanne Wouda's avatar Sanne Wouda
Browse files

[AArch64] Fix over-eager fusing of NEON SIMD MUL/ADD

Summary:
The ISel pattern for SIMD MLA is a bit too eager: it replaces the ADD with an
MLA even when the MUL cannot be eliminated, e.g. when it has another use.  An
MLA is usually has a higher latency than an ADD (and there are fewer pipes
available that can execute it), so trading an MLA for an ADD is not great.

ISel is not taking the number of uses of the MUL result into account, nor any
other factors such as the length of the critical path or other resource pressure.

The MachineCombiner is able to make these judgments so this patch ports the ISel
pattern for MUL/ADD fusing to the MachineCombiner.

Similarly for MUL/SUB -> MLS, as well as the indexed variants.

The change has no impact on SPEC CPU© intrate nor fprate.

Reviewers: dmgreen, SjoerdMeijer, fhahn, Gerolf

Subscribers: kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70673
parent b4980f77
Loading
Loading
Loading
Loading
+45 −0
Original line number Diff line number Diff line
@@ -38,6 +38,51 @@ enum class MachineCombinerPattern {
  MULSUBX_OP2,
  MULADDXI_OP1,
  MULSUBXI_OP1,
  // NEON integers vectors
  MULADDv8i8_OP1,
  MULADDv8i8_OP2,
  MULADDv16i8_OP1,
  MULADDv16i8_OP2,
  MULADDv4i16_OP1,
  MULADDv4i16_OP2,
  MULADDv8i16_OP1,
  MULADDv8i16_OP2,
  MULADDv2i32_OP1,
  MULADDv2i32_OP2,
  MULADDv4i32_OP1,
  MULADDv4i32_OP2,

  MULSUBv8i8_OP1,
  MULSUBv8i8_OP2,
  MULSUBv16i8_OP1,
  MULSUBv16i8_OP2,
  MULSUBv4i16_OP1,
  MULSUBv4i16_OP2,
  MULSUBv8i16_OP1,
  MULSUBv8i16_OP2,
  MULSUBv2i32_OP1,
  MULSUBv2i32_OP2,
  MULSUBv4i32_OP1,
  MULSUBv4i32_OP2,

  MULADDv4i16_indexed_OP1,
  MULADDv4i16_indexed_OP2,
  MULADDv8i16_indexed_OP1,
  MULADDv8i16_indexed_OP2,
  MULADDv2i32_indexed_OP1,
  MULADDv2i32_indexed_OP2,
  MULADDv4i32_indexed_OP1,
  MULADDv4i32_indexed_OP2,

  MULSUBv4i16_indexed_OP1,
  MULSUBv4i16_indexed_OP2,
  MULSUBv8i16_indexed_OP1,
  MULSUBv8i16_indexed_OP2,
  MULSUBv2i32_indexed_OP1,
  MULSUBv2i32_indexed_OP2,
  MULSUBv4i32_indexed_OP1,
  MULSUBv4i32_indexed_OP2,

  // Floating Point
  FMULADDH_OP1,
  FMULADDH_OP2,
+352 −0
Original line number Diff line number Diff line
@@ -3571,6 +3571,18 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
@@ -3713,6 +3725,13 @@ static bool getMaddPatterns(MachineInstr &Root,
    }
  };

  auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
    if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
      Patterns.push_back(Pattern);
      Found = true;
    }
  };

  typedef MachineCombinerPattern MCP;

  switch (Opc) {
@@ -3748,6 +3767,70 @@ static bool getMaddPatterns(MachineInstr &Root,
  case AArch64::SUBXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
    break;
  case AArch64::ADDv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
    break;
  case AArch64::ADDv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
    break;
  case AArch64::ADDv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
    break;
  case AArch64::ADDv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
    break;
  case AArch64::ADDv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
    break;
  case AArch64::ADDv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
    break;
  case AArch64::SUBv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
    break;
  case AArch64::SUBv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
    break;
  case AArch64::SUBv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
    break;
  case AArch64::SUBv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
    break;
  case AArch64::SUBv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
    break;
  case AArch64::SUBv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
    break;
  }
  return Found;
}
@@ -3960,6 +4043,46 @@ bool AArch64InstrInfo::isThroughputPattern(
  case MachineCombinerPattern::FMLSv2f64_OP2:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
  case MachineCombinerPattern::FMLSv4f32_OP2:
  case MachineCombinerPattern::MULADDv8i8_OP1:
  case MachineCombinerPattern::MULADDv8i8_OP2:
  case MachineCombinerPattern::MULADDv16i8_OP1:
  case MachineCombinerPattern::MULADDv16i8_OP2:
  case MachineCombinerPattern::MULADDv4i16_OP1:
  case MachineCombinerPattern::MULADDv4i16_OP2:
  case MachineCombinerPattern::MULADDv8i16_OP1:
  case MachineCombinerPattern::MULADDv8i16_OP2:
  case MachineCombinerPattern::MULADDv2i32_OP1:
  case MachineCombinerPattern::MULADDv2i32_OP2:
  case MachineCombinerPattern::MULADDv4i32_OP1:
  case MachineCombinerPattern::MULADDv4i32_OP2:
  case MachineCombinerPattern::MULSUBv8i8_OP1:
  case MachineCombinerPattern::MULSUBv8i8_OP2:
  case MachineCombinerPattern::MULSUBv16i8_OP1:
  case MachineCombinerPattern::MULSUBv16i8_OP2:
  case MachineCombinerPattern::MULSUBv4i16_OP1:
  case MachineCombinerPattern::MULSUBv4i16_OP2:
  case MachineCombinerPattern::MULSUBv8i16_OP1:
  case MachineCombinerPattern::MULSUBv8i16_OP2:
  case MachineCombinerPattern::MULSUBv2i32_OP1:
  case MachineCombinerPattern::MULSUBv2i32_OP2:
  case MachineCombinerPattern::MULSUBv4i32_OP1:
  case MachineCombinerPattern::MULSUBv4i32_OP2:
  case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
  case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
  case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
  case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
  case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
  case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
  case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
  case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    return true;
  } // end switch (Pattern)
  return false;
@@ -4063,6 +4186,30 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
  return MUL;
}

/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
@@ -4302,6 +4449,211 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
    }
    break;
  }

  case MachineCombinerPattern::MULADDv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i8_OP2:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv16i8_OP2:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_OP2:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_OP2:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_OP2:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_OP2:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULSUBv8i8_OP1:
    Opc = AArch64::MLSv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i8_OP2:
    Opc = AArch64::MLSv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv16i8_OP1:
    Opc = AArch64::MLSv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv16i8_OP2:
    Opc = AArch64::MLSv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_OP1:
    Opc = AArch64::MLSv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_OP2:
    Opc = AArch64::MLSv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_OP1:
    Opc = AArch64::MLSv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_OP2:
    Opc = AArch64::MLSv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_OP1:
    Opc = AArch64::MLSv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_OP2:
    Opc = AArch64::MLSv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_OP1:
    Opc = AArch64::MLSv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_OP2:
    Opc = AArch64::MLSv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
    Opc = AArch64::MLSv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
    Opc = AArch64::MLSv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
    Opc = AArch64::MLSv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
    Opc = AArch64::MLSv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
    Opc = AArch64::MLSv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
    Opc = AArch64::MLSv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
    Opc = AArch64::MLSv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    Opc = AArch64::MLSv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  // Floating Point Support
  case MachineCombinerPattern::FMULADDH_OP1:
    Opc = AArch64::FMADDHrrr;
+10 −8
Original line number Diff line number Diff line
@@ -3824,10 +3824,11 @@ defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
defm FRECPS   : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
defm FRSQRTS  : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
defm FSUB     : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
defm MLA      : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
                      TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
defm MLS      : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
                      TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;

// MLA and MLS are generated in MachineCombine
defm MLA      : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
defm MLS      : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;

defm MUL      : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL     : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA     : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
@@ -5557,10 +5558,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),

defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
              TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
              TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;

// Generated by MachineCombine
defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;

defm MUL   : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+36 −24

File changed.

Preview size limit exceeded, changes collapsed.

+7 −7
Original line number Diff line number Diff line
@@ -5,17 +5,17 @@ define dso_local void @jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %d
; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x1, #32]
; CHECK-NEXT:    ldr q1, [x0, #32]
; CHECK-NEXT:    ldr q2, [x1, #96]
; CHECK-NEXT:    ldr q1, [x1, #96]
; CHECK-NEXT:    ldr q2, [x0, #32]
; CHECK-NEXT:    ldr q3, [x0, #96]
; CHECK-NEXT:    ldr x8, [x2, #48]
; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    mla v1.8h, v3.8h, v2.8h
; CHECK-NEXT:    mov w9, w3
; CHECK-NEXT:    str q1, [x8, x9]
; CHECK-NEXT:    mul v0.8h, v2.8h, v0.8h
; CHECK-NEXT:    mul v1.8h, v3.8h, v1.8h
; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    str q2, [x8, x9]
; CHECK-NEXT:    ldr x8, [x2, #56]
; CHECK-NEXT:    mls v0.8h, v3.8h, v2.8h
; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    str q0, [x8, x9]
; CHECK-NEXT:    ret
entry: