[AArch64] Fix over-eager fusing of NEON SIMD MUL/ADD (f2e7de81) · Commits · llvm-doe / llvm-project

llvm/include/llvm/CodeGen/MachineCombinerPattern.h

+45 −0

Original line number	Diff line number	Diff line
		@@ -38,6 +38,51 @@ enum class MachineCombinerPattern {
		MULSUBX_OP2,
		MULADDXI_OP1,
		MULSUBXI_OP1,
		// NEON integer vectors
		MULADDv8i8_OP1,
		MULADDv8i8_OP2,
		MULADDv16i8_OP1,
		MULADDv16i8_OP2,
		MULADDv4i16_OP1,
		MULADDv4i16_OP2,
		MULADDv8i16_OP1,
		MULADDv8i16_OP2,
		MULADDv2i32_OP1,
		MULADDv2i32_OP2,
		MULADDv4i32_OP1,
		MULADDv4i32_OP2,

		MULSUBv8i8_OP1,
		MULSUBv8i8_OP2,
		MULSUBv16i8_OP1,
		MULSUBv16i8_OP2,
		MULSUBv4i16_OP1,
		MULSUBv4i16_OP2,
		MULSUBv8i16_OP1,
		MULSUBv8i16_OP2,
		MULSUBv2i32_OP1,
		MULSUBv2i32_OP2,
		MULSUBv4i32_OP1,
		MULSUBv4i32_OP2,

		MULADDv4i16_indexed_OP1,
		MULADDv4i16_indexed_OP2,
		MULADDv8i16_indexed_OP1,
		MULADDv8i16_indexed_OP2,
		MULADDv2i32_indexed_OP1,
		MULADDv2i32_indexed_OP2,
		MULADDv4i32_indexed_OP1,
		MULADDv4i32_indexed_OP2,

		MULSUBv4i16_indexed_OP1,
		MULSUBv4i16_indexed_OP2,
		MULSUBv8i16_indexed_OP1,
		MULSUBv8i16_indexed_OP2,
		MULSUBv2i32_indexed_OP1,
		MULSUBv2i32_indexed_OP2,
		MULSUBv4i32_indexed_OP1,
		MULSUBv4i32_indexed_OP2,

		// Floating Point
		FMULADDH_OP1,
		FMULADDH_OP2,

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

+352 −0

Original line number	Diff line number	Diff line
		@@ -3571,6 +3571,18 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
		// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
		case AArch64::SUBXri:
		case AArch64::SUBSXri:
		case AArch64::ADDv8i8:
		case AArch64::ADDv16i8:
		case AArch64::ADDv4i16:
		case AArch64::ADDv8i16:
		case AArch64::ADDv2i32:
		case AArch64::ADDv4i32:
		case AArch64::SUBv8i8:
		case AArch64::SUBv16i8:
		case AArch64::SUBv4i16:
		case AArch64::SUBv8i16:
		case AArch64::SUBv2i32:
		case AArch64::SUBv4i32:
		return true;
		default:
		break;
		@@ -3713,6 +3725,13 @@ static bool getMaddPatterns(MachineInstr &Root,
		}
		};

		// Vector variant of the pattern recorder: if canCombine() accepts operand
		// number `Operand` of Root for opcode `Opcode`, mark the search as
		// successful and append `Pattern` to the candidate list. Otherwise do
		// nothing.
		auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
		  if (!canCombine(MBB, Root.getOperand(Operand), Opcode))
		    return;
		  Found = true;
		  Patterns.push_back(Pattern);
		};

		typedef MachineCombinerPattern MCP;

		switch (Opc) {
		@@ -3748,6 +3767,70 @@ static bool getMaddPatterns(MachineInstr &Root,
		case AArch64::SUBXri:
		setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
		break;
		case AArch64::ADDv8i8:
		setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
		setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
		break;
		case AArch64::ADDv16i8:
		setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
		setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
		break;
		case AArch64::ADDv4i16:
		setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
		setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
		setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
		setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
		break;
		case AArch64::ADDv8i16:
		setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
		setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
		setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
		setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
		break;
		case AArch64::ADDv2i32:
		setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
		setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
		setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
		setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
		break;
		case AArch64::ADDv4i32:
		setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
		setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
		setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
		setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
		break;
		case AArch64::SUBv8i8:
		setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
		setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
		break;
		case AArch64::SUBv16i8:
		setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
		setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
		break;
		case AArch64::SUBv4i16:
		setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
		setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
		setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
		setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
		break;
		case AArch64::SUBv8i16:
		setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
		setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
		setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
		setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
		break;
		case AArch64::SUBv2i32:
		setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
		setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
		setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
		setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
		break;
		case AArch64::SUBv4i32:
		setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
		setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
		setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
		setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
		break;
		}
		return Found;
		}
		@@ -3960,6 +4043,46 @@ bool AArch64InstrInfo::isThroughputPattern(
		case MachineCombinerPattern::FMLSv2f64_OP2:
		case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
		case MachineCombinerPattern::FMLSv4f32_OP2:
		case MachineCombinerPattern::MULADDv8i8_OP1:
		case MachineCombinerPattern::MULADDv8i8_OP2:
		case MachineCombinerPattern::MULADDv16i8_OP1:
		case MachineCombinerPattern::MULADDv16i8_OP2:
		case MachineCombinerPattern::MULADDv4i16_OP1:
		case MachineCombinerPattern::MULADDv4i16_OP2:
		case MachineCombinerPattern::MULADDv8i16_OP1:
		case MachineCombinerPattern::MULADDv8i16_OP2:
		case MachineCombinerPattern::MULADDv2i32_OP1:
		case MachineCombinerPattern::MULADDv2i32_OP2:
		case MachineCombinerPattern::MULADDv4i32_OP1:
		case MachineCombinerPattern::MULADDv4i32_OP2:
		case MachineCombinerPattern::MULSUBv8i8_OP1:
		case MachineCombinerPattern::MULSUBv8i8_OP2:
		case MachineCombinerPattern::MULSUBv16i8_OP1:
		case MachineCombinerPattern::MULSUBv16i8_OP2:
		case MachineCombinerPattern::MULSUBv4i16_OP1:
		case MachineCombinerPattern::MULSUBv4i16_OP2:
		case MachineCombinerPattern::MULSUBv8i16_OP1:
		case MachineCombinerPattern::MULSUBv8i16_OP2:
		case MachineCombinerPattern::MULSUBv2i32_OP1:
		case MachineCombinerPattern::MULSUBv2i32_OP2:
		case MachineCombinerPattern::MULSUBv4i32_OP1:
		case MachineCombinerPattern::MULSUBv4i32_OP2:
		case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
		case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
		case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
		case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
		case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
		case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
		case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
		case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
		case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
		case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
		case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
		case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
		case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
		case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
		case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
		case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
		return true;
		} // end switch (Pattern)
		return false;
		@@ -4063,6 +4186,30 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
		return MUL;
		}

		/// genFusedMultiplyAcc - Convenience wrapper around genFusedMultiply that
		/// always passes FMAInstKind::Accumulator as the instruction kind.
		///
		/// Every other argument is forwarded unchanged, and the value returned is
		/// exactly what genFusedMultiply returns.
		///
		/// \see genFusedMultiply
		static MachineInstr *genFusedMultiplyAcc(
		    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
		    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
		    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
		  MachineInstr *Fused =
		      genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
		                       FMAInstKind::Accumulator);
		  return Fused;
		}

		/// genFusedMultiplyIdx - Convenience wrapper around genFusedMultiply that
		/// always passes FMAInstKind::Indexed as the instruction kind.
		///
		/// Every other argument is forwarded unchanged, and the value returned is
		/// exactly what genFusedMultiply returns.
		///
		/// \see genFusedMultiply
		static MachineInstr *genFusedMultiplyIdx(
		    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
		    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
		    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
		  MachineInstr *Fused =
		      genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
		                       FMAInstKind::Indexed);
		  return Fused;
		}

		/// genMaddR - Generate madd instruction and combine mul and add using
		/// an extra virtual register
		/// Example - an ADD intermediate needs to be stored in a register:
		@@ -4302,6 +4449,211 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
		}
		break;
		}

		case MachineCombinerPattern::MULADDv8i8_OP1:
		Opc = AArch64::MLAv8i8;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv8i8_OP2:
		Opc = AArch64::MLAv8i8;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv16i8_OP1:
		Opc = AArch64::MLAv16i8;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv16i8_OP2:
		Opc = AArch64::MLAv16i8;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i16_OP1:
		Opc = AArch64::MLAv4i16;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i16_OP2:
		Opc = AArch64::MLAv4i16;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv8i16_OP1:
		Opc = AArch64::MLAv8i16;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv8i16_OP2:
		Opc = AArch64::MLAv8i16;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv2i32_OP1:
		Opc = AArch64::MLAv2i32;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv2i32_OP2:
		Opc = AArch64::MLAv2i32;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i32_OP1:
		Opc = AArch64::MLAv4i32;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i32_OP2:
		Opc = AArch64::MLAv4i32;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;

		case MachineCombinerPattern::MULSUBv8i8_OP1:
		Opc = AArch64::MLSv8i8;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv8i8_OP2:
		Opc = AArch64::MLSv8i8;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv16i8_OP1:
		Opc = AArch64::MLSv16i8;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv16i8_OP2:
		Opc = AArch64::MLSv16i8;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i16_OP1:
		Opc = AArch64::MLSv4i16;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i16_OP2:
		Opc = AArch64::MLSv4i16;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv8i16_OP1:
		Opc = AArch64::MLSv8i16;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv8i16_OP2:
		Opc = AArch64::MLSv8i16;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv2i32_OP1:
		Opc = AArch64::MLSv2i32;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv2i32_OP2:
		Opc = AArch64::MLSv2i32;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i32_OP1:
		Opc = AArch64::MLSv4i32;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i32_OP2:
		Opc = AArch64::MLSv4i32;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;

		case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
		Opc = AArch64::MLAv4i16_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
		Opc = AArch64::MLAv4i16_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
		Opc = AArch64::MLAv8i16_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
		Opc = AArch64::MLAv8i16_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
		Opc = AArch64::MLAv2i32_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
		Opc = AArch64::MLAv2i32_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
		Opc = AArch64::MLAv4i32_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
		Opc = AArch64::MLAv4i32_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;

		case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
		Opc = AArch64::MLSv4i16_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
		Opc = AArch64::MLSv4i16_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
		Opc = AArch64::MLSv8i16_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
		Opc = AArch64::MLSv8i16_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
		Opc = AArch64::MLSv2i32_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
		Opc = AArch64::MLSv2i32_indexed;
		RC = &AArch64::FPR64RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
		Opc = AArch64::MLSv4i32_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
		Opc = AArch64::MLSv4i32_indexed;
		RC = &AArch64::FPR128RegClass;
		MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;

		// Floating Point Support
		case MachineCombinerPattern::FMULADDH_OP1:
		Opc = AArch64::FMADDHrrr;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

+10 −8

Original line number	Diff line number	Diff line
		@@ -3824,10 +3824,11 @@ defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
		defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
		defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
		defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
		defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
		TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
		defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
		TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;

		// MLA and MLS are generated in MachineCombine
		defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
		defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;

		defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
		defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
		defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
		@@ -5557,10 +5558,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),

		defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
		defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
		defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
		TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
		defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
		TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;

		// Generated by MachineCombine
		defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
		defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;

		defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
		defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
		TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;

llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir

+36 −24

File changed.

Preview size limit exceeded, changes collapsed.

llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll

+7 −7

Original line number	Diff line number	Diff line
		@@ -5,17 +5,17 @@ define dso_local void @jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %d
		; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
		; CHECK: // %bb.0: // %entry
		; CHECK-NEXT: ldr q0, [x1, #32]
		; CHECK-NEXT: ldr q1, [x0, #32]
		; CHECK-NEXT: ldr q2, [x1, #96]
		; CHECK-NEXT: ldr q1, [x1, #96]
		; CHECK-NEXT: ldr q2, [x0, #32]
		; CHECK-NEXT: ldr q3, [x0, #96]
		; CHECK-NEXT: ldr x8, [x2, #48]
		; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h
		; CHECK-NEXT: mov v1.16b, v0.16b
		; CHECK-NEXT: mla v1.8h, v3.8h, v2.8h
		; CHECK-NEXT: mov w9, w3
		; CHECK-NEXT: str q1, [x8, x9]
		; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h
		; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h
		; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
		; CHECK-NEXT: str q2, [x8, x9]
		; CHECK-NEXT: ldr x8, [x2, #56]
		; CHECK-NEXT: mls v0.8h, v3.8h, v2.8h
		; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
		; CHECK-NEXT: str q0, [x8, x9]
		; CHECK-NEXT: ret
		entry: