[AArch64][SVE] Gather loads: pass 32 bit unpacked offsets as nxv2i32 (404da13e) · Commits · Cabrera, Anthony / llvm-project

llvm/include/llvm/IR/IntrinsicsAArch64.td

+2 −1

Original line number	Diff line number	Diff line
		@@ -1114,7 +1114,8 @@ class AdvSIMD_GatherLoad_32bitOffset_Intrinsic
		: Intrinsic<[llvm_anyvector_ty],
		[
		LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
		LLVMPointerToElt<0>, llvm_anyvector_ty
		LLVMPointerToElt<0>,
		LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>
		],
		[IntrReadMem, IntrArgMemOnly]>;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+21 −14

Original line number	Diff line number	Diff line
		@@ -12231,18 +12231,14 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
		}

		static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
		unsigned Opcode) {
		unsigned Opcode,
		bool OnlyPackedOffsets = true) {
		EVT RetVT = N->getValueType(0);
		assert(RetVT.isScalableVector() &&
		"Gather loads are only possible for SVE vectors");

		SDLoc DL(N);
		MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
		unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits();

		EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements);
		if (RetVT.getSizeInBits().getKnownMinSize() >
		MaxVT.getSizeInBits().getKnownMinSize())
		if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
		return SDValue();

		// Depending on the addressing mode, this is either a pointer or a vector of
		@@ -12250,12 +12246,19 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
		const SDValue Base = N->getOperand(3);
		// Depending on the addressing mode, this is either a single offset or a
		// vector of offsets (that fits into one register)
		const SDValue Offset = N->getOperand(4);
		SDValue Offset = N->getOperand(4);

		if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) ||
		!DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType()))
		auto &TLI = DAG.getTargetLoweringInfo();
		if (!TLI.isTypeLegal(Base.getValueType()))
		return SDValue();

		// Some gather load variants allow unpacked offsets, but only as nxv2i32
		// vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
		// nxv2i64. Legalize accordingly.
		if (!OnlyPackedOffsets &&
		Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
		Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

		// Return value type that is representable in hardware
		EVT HwRetVt = getSVEContainerType(RetVT);

		@@ -12439,13 +12442,17 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
		case Intrinsic::aarch64_sve_ld1_gather_index:
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
		case Intrinsic::aarch64_sve_ld1_gather_sxtw:
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW);
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
		/*OnlyPackedOffsets=*/false);
		case Intrinsic::aarch64_sve_ld1_gather_uxtw:
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW);
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
		/*OnlyPackedOffsets=*/false);
		case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED);
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
		/*OnlyPackedOffsets=*/false);
		case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED);
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
		/*OnlyPackedOffsets=*/false);
		case Intrinsic::aarch64_sve_ld1_gather_imm:
		return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
		case Intrinsic::aarch64_sve_st1_scatter:

llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll

+86 −86

Original line number	Diff line number	Diff line
		@@ -11,7 +11,7 @@ define <vscale x 4 x i32> @gld1h_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base,
		; CHECK-LABEL: gld1h_s_uxtw_index:
		; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
		i16* %base,
		<vscale x 4 x i32> %b)
		%res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
		@@ -22,31 +22,31 @@ define <vscale x 4 x i32> @gld1h_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base,
		; CHECK-LABEL: gld1h_s_sxtw_index:
		; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
		i16* %base,
		<vscale x 4 x i32> %b)
		%res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
		ret <vscale x 4 x i32> %res
		}

		define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1h_d_uxtw_index:
		; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
		i16* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}

		define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1h_d_sxtw_index:
		; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
		i16* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}
		@@ -56,7 +56,7 @@ define <vscale x 4 x i32> @gld1w_s_uxtw_index(<vscale x 4 x i1> %pg, i32* %base,
		; CHECK-LABEL: gld1w_s_uxtw_index:
		; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
		i32* %base,
		<vscale x 4 x i32> %b)
		ret <vscale x 4 x i32> %load
		@@ -66,30 +66,30 @@ define <vscale x 4 x i32> @gld1w_s_sxtw_index(<vscale x 4 x i1> %pg, i32* %base,
		; CHECK-LABEL: gld1w_s_sxtw_index:
		; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
		i32* %base,
		<vscale x 4 x i32> %b)
		ret <vscale x 4 x i32> %load
		}

		define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1w_d_uxtw_index:
		; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
		i32* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}

		define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1w_d_sxtw_index:
		; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
		i32* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}
		@@ -98,7 +98,7 @@ define <vscale x 4 x float> @gld1w_s_uxtw_index_float(<vscale x 4 x i1> %pg, flo
		; CHECK-LABEL: gld1w_s_uxtw_index_float:
		; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
		float* %base,
		<vscale x 4 x i32> %b)
		ret <vscale x 4 x float> %load
		@@ -108,50 +108,50 @@ define <vscale x 4 x float> @gld1w_s_sxtw_index_float(<vscale x 4 x i1> %pg, flo
		; CHECK-LABEL: gld1w_s_sxtw_index_float:
		; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
		float* %base,
		<vscale x 4 x i32> %b)
		ret <vscale x 4 x float> %load
		}

		; LD1D
		define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1d_s_uxtw_index:
		; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
		i64* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		ret <vscale x 2 x i64> %load
		}

		define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1d_sxtw_index:
		; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
		i64* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		ret <vscale x 2 x i64> %load
		}

		define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1d_uxtw_index_double:
		; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
		double* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		ret <vscale x 2 x double> %load
		}

		define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1d_sxtw_index_double:
		; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
		double* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		ret <vscale x 2 x double> %load
		}

		@@ -166,7 +166,7 @@ define <vscale x 4 x i32> @gld1sh_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base
		; CHECK-LABEL: gld1sh_s_uxtw_index:
		; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
		i16* %base,
		<vscale x 4 x i32> %b)
		%res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
		@@ -177,79 +177,79 @@ define <vscale x 4 x i32> @gld1sh_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base
		; CHECK-LABEL: gld1sh_s_sxtw_index:
		; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
		%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
		i16* %base,
		<vscale x 4 x i32> %b)
		%res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
		ret <vscale x 4 x i32> %res
		}

		define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1sh_d_uxtw_index:
		; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
		i16* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}

		define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1sh_d_sxtw_index:
		; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
		i16* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}

		; LD1SW
		define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1sw_d_uxtw_index:
		; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
		i32* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}

		define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
		define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
		; CHECK-LABEL: gld1sw_d_sxtw_index:
		; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
		; CHECK-NEXT: ret
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
		%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
		i32* %base,
		<vscale x 2 x i64> %b)
		<vscale x 2 x i32> %b)
		%res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
		ret <vscale x 2 x i64> %res
		}


		; LD1H/LD1SH
		declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
		declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
		declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
		declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)

		declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
		declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
		declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
		declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)

		; LD1W/LD1SW
		declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
		declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
		declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
		declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)

		declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
		declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
		declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
		declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)

		declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
		declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
		declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
		declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)

		; LD1D
		declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
		declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
		declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
		declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

		declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
		declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
		declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
		declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)

llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll

+118 −118

File changed.

Preview size limit exceeded, changes collapsed.

Admin message