Commit 0f2b68d9 authored by Francesco Petrogalli

Implement IR intrinsics for gather prefetch.

Summary:
Intrinsics and the corresponding codegen have been implemented for the
following SVE instructions:

1. PRF<T> <prfop>, <Pg>, [<Xn|SP>, <Zm>.S, <mod>] -> 32-bit          scaled offset
2. PRF<T> <prfop>, <Pg>, [<Xn|SP>, <Zm>.D, <mod>] -> 32-bit unpacked scaled offset
3. PRF<T> <prfop>, <Pg>, [<Xn|SP>, <Zm>.D]        -> 64-bit          scaled offset
4. PRF<T> <prfop>, <Pg>, [<Zn>.S{, #<imm>}]       -> 32-bit element
5. PRF<T> <prfop>, <Pg>, [<Zn>.D{, #<imm>}]       -> 64-bit element
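
For example, form 1 instantiated with <T> = H and <mod> = uxtw #1 is:

   prfh pldl1keep, p0, [x0, z0.s, uxtw #1]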

The instructions are associated with the following intrinsics, respectively:

1. void @llvm.aarch64.sve.gather.prf<T>.scaled.<mod>.nxv4i32(
          <vscale x 4 x i1> %Pg,
          i8* %base,
          <vscale x 4 x i32> %offset,
          i32 %prfop)

2. void @llvm.aarch64.sve.gather.prf<T>.scaled.<mod>.nxv2i32(
          <vscale x 2 x i1> %Pg,
          i8* %base,
          <vscale x 2 x i32> %offset,
          i32 %prfop)

3. void @llvm.aarch64.sve.gather.prf<T>.scaled.nxv2i64(
          <vscale x 2 x i1> %Pg,
          i8* %base,
          <vscale x 2 x i64> %offset,
          i32 %prfop)

4. void @llvm.aarch64.sve.gather.prf<T>.nxv4i32(
          <vscale x 4 x i1> %Pg,
          <vscale x 4 x i32> %bases,
          i64 %imm,
          i32 %prfop)

5. void @llvm.aarch64.sve.gather.prf<T>.nxv2i64(
          <vscale x 2 x i1> %Pg,
          <vscale x 2 x i64> %bases,
          i64 %imm,
          i32 %prfop)
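
As an example, intrinsic 1 could be invoked in IR as follows for <T> = B
(a sketch: prfop 0 encodes PLDL1KEEP, and %pg, %base and %offsets are
placeholder values):

    call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nxv4i32(
              <vscale x 4 x i1> %pg,
              i8* %base,
              <vscale x 4 x i32> %offsets,
              i32 0)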

The intrinsics are the IR counterparts of the following SVE ACLE functions:

* void svprf<T>(svbool_t pg, const void *base, svprfop op)
* void svprf<T>_vnum(svbool_t pg, const void *base, int64_t vnum, svprfop op)
* void svprf<T>_gather[_u32base](svbool_t pg, svuint32_t bases, svprfop op)
* void svprf<T>_gather[_u64base](svbool_t pg, svuint64_t bases, svprfop op)
* void svprf<T>_gather_[s32]offset(svbool_t pg, const void *base, svint32_t offsets, svprfop op)
* void svprf<T>_gather_[u32]offset(svbool_t pg, const void *base, svuint32_t offsets, svprfop op)
* void svprf<T>_gather_[s64]offset(svbool_t pg, const void *base, svint64_t offsets, svprfop op)
* void svprf<T>_gather_[u64]offset(svbool_t pg, const void *base, svuint64_t offsets, svprfop op)
* void svprf<T>_gather[_u32base]_offset(svbool_t pg, svuint32_t bases, int64_t offset, svprfop op)
* void svprf<T>_gather[_u64base]_offset(svbool_t pg, svuint64_t bases, int64_t offset, svprfop op)
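
For illustration, the byte variant of the scalar-base-plus-vector-offset
form could be used from C as follows (a sketch, assuming a compiler that
provides arm_sve.h; prefetch_block and its arguments are hypothetical):

    #include <arm_sve.h>

    // Prefetch base[offsets[i]] into L1 (keep policy) for every active lane.
    void prefetch_block(const void *base, svuint32_t offsets) {
      svbool_t pg = svptrue_b8();
      svprfb_gather_u32offset(pg, base, offsets, SV_PLDL1KEEP);
    }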

Reviewers: andwar, sdesmalen, efriedma, rengolin

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D75580
parent 0616e996
llvm/include/llvm/IR/IntrinsicsAArch64.td +48 −1
@@ -1263,6 +1263,27 @@ class AdvSIMD_ScatterStore_VS_Intrinsic
               ],
               [IntrWriteMem, IntrArgMemOnly]>;


+class SVE_gather_prf_scalar_base_vector_offset_scaled
+    : Intrinsic<[],
+                [
+                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Predicate
+                  llvm_ptr_ty, // Base address
+                  llvm_anyvector_ty, // Offsets
+                  llvm_i32_ty // Prfop
+                ],
+                [IntrInaccessibleMemOrArgMemOnly, NoCapture<1>, ImmArg<3>]>;
+
+class SVE_gather_prf_vector_base_scalar_offset
+    : Intrinsic<[],
+                [
+                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Predicate
+                  llvm_anyvector_ty, // Base addresses
+                  llvm_i64_ty, // Scalar offset
+                  llvm_i32_ty // Prfop
+                ],
+                [IntrInaccessibleMemOrArgMemOnly, ImmArg<3>]>;

//
// Loads
//
@@ -1279,13 +1300,39 @@ def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic;

//
-// Prefetch
+// Prefetches
//

def int_aarch64_sve_prf
  : Intrinsic<[], [llvm_anyvector_ty, llvm_ptr_ty, llvm_i32_ty],
                  [IntrArgMemOnly, ImmArg<2>]>;

+// Scalar + 32-bit scaled offset vector, zero extend, packed and
+// unpacked.
+def int_aarch64_sve_gather_prfb_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Scalar + 32-bit scaled offset vector, sign extend, packed and
+// unpacked.
+def int_aarch64_sve_gather_prfb_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Scalar + 64-bit scaled offset vector.
+def int_aarch64_sve_gather_prfb_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Vector + scalar.
+def int_aarch64_sve_gather_prfb : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfh : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfw : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfd : SVE_gather_prf_vector_base_scalar_offset;

//
// Scalar to vector operations
//
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +90 −16
@@ -12646,6 +12646,20 @@ static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
+
+/// Check if the value of \p Offset represents a valid immediate for the SVE
+/// gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+///      [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
+                                           unsigned ScalarSizeInBytes) {
+  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
+  return OffsetConst && AArch64_AM::isValidImmForSVEVecImmAddrMode(
+                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
+}
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                          unsigned Opcode,
                                          bool OnlyPackedOffsets = true) {
@@ -12697,13 +12711,9 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
  // immediates outside that range and non-immediate scalar offsets use SST1 or
  // SST1_UXTW instead.
  if (Opcode == AArch64ISD::SST1_IMM) {
-    uint64_t MaxIndex = 31;
-    uint64_t SrcElSize = SrcElVT.getStoreSize().getKnownMinSize();
-    ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
-    if (nullptr == OffsetConst ||
-        OffsetConst->getZExtValue() > MaxIndex * SrcElSize ||
-        OffsetConst->getZExtValue() % SrcElSize) {
+    if (!isValidImmForSVEVecImmAddrMode(Offset,
+                                        SrcVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = AArch64ISD::SST1_UXTW;
      else
@@ -12763,7 +12773,6 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
         "Gather loads are only possible for SVE vectors");
  SDLoc DL(N);
-  MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
  // Make sure that the loaded data will fit into an SVE register
  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
@@ -12780,8 +12789,8 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
  // applies to non-temporal gathers because there's no instruction that takes
  // indicies.
  if (Opcode == AArch64ISD::GLDNT1_INDEX) {
-    Offset =
-        getScaledOffsetForBitWidth(DAG, Offset, DL, RetElVT.getSizeInBits());
+    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+                                        RetVT.getScalarSizeInBits());
    Opcode = AArch64ISD::GLDNT1;
  }
@@ -12800,13 +12809,8 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
  // immediates outside that range and non-immediate scalar offsets use GLD1 or
  // GLD1_UXTW instead.
  if (Opcode == AArch64ISD::GLD1_IMM || Opcode == AArch64ISD::GLDFF1_IMM) {
-    uint64_t MaxIndex = 31;
-    uint64_t RetElSize = RetElVT.getStoreSize().getKnownMinSize();
-
-    ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
-    if (nullptr == OffsetConst ||
-        OffsetConst->getZExtValue() > MaxIndex * RetElSize ||
-        OffsetConst->getZExtValue() % RetElSize) {
+    if (!isValidImmForSVEVecImmAddrMode(Offset,
+                                        RetVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1_UXTW
                                                  : AArch64ISD::GLDFF1_UXTW;
@@ -12950,6 +12954,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  return SDValue(N, 0);
}
+/// Legalize the gather prefetch (scalar + vector addressing mode) when the
+/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
+/// != nxv2i32) do not need legalization.
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
+  const unsigned OffsetPos = 4;
+  SDValue Offset = N->getOperand(OffsetPos);
+
+  // Not an unpacked vector, bail out.
+  if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
+    return SDValue();
+
+  // Extend the unpacked offset vector to 64-bit lanes.
+  SDLoc DL(N);
+  Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
+  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+  // Replace the offset operand with the 64-bit one.
+  Ops[OffsetPos] = Offset;
+
+  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
+/// Combines a node carrying the intrinsic `aarch64_sve_gather_prf<T>` into a
+/// node that uses `aarch64_sve_gather_prf<T>_scaled_uxtw` when the scalar
+/// offset passed to `aarch64_sve_gather_prf<T>` is not a valid immediate for
+/// the SVE gather prefetch instruction with vector plus immediate addressing
+/// mode.
+static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
+                                               unsigned NewIID,
+                                               unsigned ScalarSizeInBytes) {
+  const unsigned ImmPos = 4, OffsetPos = 3;
+  // No need to combine the node if the immediate is valid...
+  if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
+    return SDValue();
+
+  // ...otherwise swap the vector of bases with the scalar offset...
+  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+  std::swap(Ops[ImmPos], Ops[OffsetPos]);
+  // ...and remap the intrinsic `aarch64_sve_gather_prf<T>` to
+  // `aarch64_sve_gather_prf<T>_scaled_uxtw`.
+  SDLoc DL(N);
+  Ops[1] = DAG.getConstant(NewIID, DL, MVT::i64);
+
+  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
@@ -13014,6 +13063,31 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::aarch64_sve_gather_prfb:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw,
+          1 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfh:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw,
+          2 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfw:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw,
+          4 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfd:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw,
+          8 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfb_scaled_sxtw:
+    case Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfh_scaled_sxtw:
+    case Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfw_scaled_sxtw:
+    case Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfd_scaled_sxtw:
+      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
    case Intrinsic::aarch64_neon_ld2:
    case Intrinsic::aarch64_neon_ld3:
    case Intrinsic::aarch64_neon_ld4:
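
To illustrate the combine above with a hypothetical example: for byte
elements the valid immediates are 0, 1, ..., 31, so a node such as

    call void @llvm.aarch64.sve.gather.prfb.nxv4i32(
              <vscale x 4 x i1> %pg, <vscale x 4 x i32> %bases, i64 32, i32 0)

cannot be selected as PRFB <prfop>, <Pg>, [<Zn>.S, #<imm>]. The combine
swaps the out-of-range immediate with the vector of bases and remaps the
node to aarch64_sve_gather_prfb_scaled_uxtw, i.e. it is selected as a
prefetch from scalar base 32 with vector offset %bases.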
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +21 −21
@@ -880,37 +880,37 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio

  // Gather prefetch using scaled 32-bit offsets, e.g.
  //    prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
-  defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only,  ZPR32ExtUXTW8Only>;
-  defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
-  defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
-  defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
+  defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only,  ZPR32ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>;
+  defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16,     ZPR32ExtUXTW16,    int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>;
+  defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32,     ZPR32ExtUXTW32,    int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>;
+  defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64,     ZPR32ExtUXTW64,    int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>;

  // Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
  //    prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
-  defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
-  defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
-  defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
-  defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+  defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>;
+  defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16,    ZPR64ExtUXTW16,    int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>;
+  defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32,    ZPR64ExtUXTW32,    int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>;
+  defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64,    ZPR64ExtUXTW64,    int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>;

  // Gather prefetch using scaled 64-bit offsets, e.g.
  //    prfh pldl1keep, p0, [x0, z0.d, lsl #1]
-  defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
-  defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
-  defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
-  defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
+  defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8,  int_aarch64_sve_gather_prfb_scaled>;
+  defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_gather_prfh_scaled>;
+  defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_gather_prfw_scaled>;
+  defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_gather_prfd_scaled>;

  // Gather prefetch using 32/64-bit pointers with offset, e.g.
  //    prfh pldl1keep, p0, [z0.s, #16]
  //    prfh pldl1keep, p0, [z0.d, #16]
-  defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
-  defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
-  defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
-  defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
-
-  defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
-  defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
-  defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
-  defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+  defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>;
+  defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>;
+  defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>;
+  defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>;
+
+  defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>;
+  defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>;
+  defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>;
+  defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>;

  defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
  defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +20 −0
@@ -840,6 +840,26 @@ inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
  return isAnyMOVZMovAlias(Value, RegWidth);
}

+/// Check if the value of \p OffsetInBytes can be used as an immediate for
+/// the gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+///      [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
+                                           unsigned ScalarSizeInBytes) {
+  // The immediate is not a multiple of the scalar size.
+  if (OffsetInBytes % ScalarSizeInBytes)
+    return false;
+
+  // The immediate is out of range.
+  if (OffsetInBytes / ScalarSizeInBytes > 31)
+    return false;
+
+  return true;
+}

} // end namespace AArch64_AM

} // end namespace llvm
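
A quick worked example of the check above for prfd (8-byte elements):
OffsetInBytes = 48 is valid (48 % 8 == 0 and 48 / 8 = 6 <= 31), while 250
is rejected (250 % 8 != 0) and 256 is rejected (256 / 8 = 32 > 31).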
llvm/lib/Target/AArch64/SVEInstrFormats.td +32 −5
@@ -6455,9 +6455,17 @@ class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,

multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
                                      RegisterOperand sxtw_opnd,
-                                      RegisterOperand uxtw_opnd> {
+                                      RegisterOperand uxtw_opnd,
+                                      PatFrag op_sxtw,
+                                      PatFrag op_uxtw> {
  def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
  def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;

+  def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+  def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}

class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
@@ -6480,11 +6488,14 @@ class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
  let Inst{3-0}   = prfop;
}

-multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
  def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;

  def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
                  (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;

+  def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
}

class sve_mem_z_fill<string asm>
@@ -6798,14 +6809,27 @@ class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,

multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
                                          RegisterOperand sxtw_opnd,
-                                          RegisterOperand uxtw_opnd> {
+                                          RegisterOperand uxtw_opnd,
+                                          PatFrag op_sxtw,
+                                          PatFrag op_uxtw> {
  def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
  def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;

+  def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+  def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;

}

multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
-                                          RegisterOperand zprext> {
+                                          RegisterOperand zprext, PatFrag frag> {
  def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;

+  def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;

}


@@ -6831,11 +6855,14 @@ class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
  let hasSideEffects = 1;
}

-multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
  def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;

  def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
                  (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;

+  def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR64:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR64:$Zn, imm_ty:$imm)>;
}

//===----------------------------------------------------------------------===//