Commit 404da13e authored by Andrzej Warzynski's avatar Andrzej Warzynski
Browse files

[AArch64][SVE] Gather loads: pass 32 bit unpacked offsets as nxv2i32

Summary:
Currently 32 bit unpacked offsets are passed as nxv2i64. However, as
pointed out in https://reviews.llvm.org/D71074, using nxv2i32 instead
would improve consistency with:
  * how other arguments are treated
  * how scatter stores are implemented
This patch makes sure that 32 bit unpacked offsets are passed as nxv2i32
instead of nxv2i64.

Reviewers: sdesmalen, efriedma

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71724
parent 535b3c6b
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -1114,7 +1114,8 @@ class AdvSIMD_GatherLoad_32bitOffset_Intrinsic
    : Intrinsic<[llvm_anyvector_ty],
                [
                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                  LLVMPointerToElt<0>, llvm_anyvector_ty
                  LLVMPointerToElt<0>,
                  LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>
                ],
                [IntrReadMem, IntrArgMemOnly]>;

+21 −14
Original line number Diff line number Diff line
@@ -12231,18 +12231,14 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
}

static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
                                       unsigned Opcode) {
                                       unsigned Opcode,
                                       bool OnlyPackedOffsets = true) {
  EVT RetVT = N->getValueType(0);
  assert(RetVT.isScalableVector() &&
         "Gather loads are only possible for SVE vectors");

  SDLoc DL(N);
  MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
  unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits();

  EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements);
  if (RetVT.getSizeInBits().getKnownMinSize() >
      MaxVT.getSizeInBits().getKnownMinSize())
  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
@@ -12250,12 +12246,19 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
  const SDValue Base = N->getOperand(3);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets  (that fits into one register)
  const SDValue Offset = N->getOperand(4);
  SDValue Offset = N->getOperand(4);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) ||
      !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType()))
  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  // Return value type that is representable in hardware
  EVT HwRetVt = getSVEContainerType(RetVT);

@@ -12439,13 +12442,17 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
    case Intrinsic::aarch64_sve_ld1_gather_index:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW);
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW);
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED);
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED);
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_imm:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
    case Intrinsic::aarch64_sve_st1_scatter:
+86 −86
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ define <vscale x 4 x i32> @gld1h_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base,
; CHECK-LABEL: gld1h_s_uxtw_index:
; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT:	ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
                                                                                  i16* %base,
                                                                                  <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
@@ -22,31 +22,31 @@ define <vscale x 4 x i32> @gld1h_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base,
; CHECK-LABEL: gld1h_s_sxtw_index:
; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
                                                                                  i16* %base,
                                                                                  <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1h_d_uxtw_index:
; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
                                                                                  i16* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1h_d_sxtw_index:
; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
                                                                                  i16* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
@@ -56,7 +56,7 @@ define <vscale x 4 x i32> @gld1w_s_uxtw_index(<vscale x 4 x i1> %pg, i32* %base,
; CHECK-LABEL: gld1w_s_uxtw_index:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:	ret
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
                                                                                  i32* %base,
                                                                                  <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
@@ -66,30 +66,30 @@ define <vscale x 4 x i32> @gld1w_s_sxtw_index(<vscale x 4 x i1> %pg, i32* %base,
; CHECK-LABEL: gld1w_s_sxtw_index:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
                                                                                  i32* %base,
                                                                                  <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
}

define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1w_d_uxtw_index:
; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
                                                                                  i32* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1w_d_sxtw_index:
; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
                                                                                  i32* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
@@ -98,7 +98,7 @@ define <vscale x 4 x float> @gld1w_s_uxtw_index_float(<vscale x 4 x i1> %pg, flo
; CHECK-LABEL: gld1w_s_uxtw_index_float:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:	ret
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
                                                                                    float* %base,
                                                                                    <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
@@ -108,50 +108,50 @@ define <vscale x 4 x float> @gld1w_s_sxtw_index_float(<vscale x 4 x i1> %pg, flo
; CHECK-LABEL: gld1w_s_sxtw_index_float:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:	ret
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
                                                                                    float* %base,
                                                                                    <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
}

; LD1D
define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_s_uxtw_index:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
; CHECK-NEXT:	ret
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
                                                                                  i64* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_sxtw_index:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT:	ret
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
                                                                                  i64* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_uxtw_index_double:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
; CHECK-NEXT:	ret
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
                                                                                     double* %base,
                                                                                             <vscale x 2 x i64> %b)
                                                                                     <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
}

define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_sxtw_index_double:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT:	ret
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
                                                                                     double* %base,
                                                                                             <vscale x 2 x i64> %b)
                                                                                     <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
}

@@ -166,7 +166,7 @@ define <vscale x 4 x i32> @gld1sh_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base
; CHECK-LABEL: gld1sh_s_uxtw_index:
; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT:	ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
                                                                                  i16* %base,
                                                                                  <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
@@ -177,79 +177,79 @@ define <vscale x 4 x i32> @gld1sh_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base
; CHECK-LABEL: gld1sh_s_sxtw_index:
; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
                                                                                  i16* %base,
                                                                                  <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sh_d_uxtw_index:
; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
                                                                                  i16* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sh_d_sxtw_index:
; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
                                                                                  i16* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

; LD1SW
define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sw_d_uxtw_index:
; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
                                                                                  i32* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sw_d_sxtw_index:
; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
                                                                                  i32* %base,
                                                                                          <vscale x 2 x i64> %b)
                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}


; LD1H/LD1SH
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)

declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)

; LD1W/LD1SW
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)

declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)

declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)

; LD1D
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+118 −118

File changed.

Preview size limit exceeded, changes collapsed.