Commit 7e20c3a7 authored by Andrzej Warzynski

[AArch64][SVE] Add intrinsics for scatter stores

Summary:
This patch adds the following SVE intrinsics for scatter stores:
* 64-bit offsets:
  * @llvm.aarch64.sve.st1.scatter (unscaled)
  * @llvm.aarch64.sve.st1.scatter.index (scaled)
* 32-bit unscaled offsets:
  * @llvm.aarch64.sve.st1.scatter.uxtw (zero-extended offset)
  * @llvm.aarch64.sve.st1.scatter.sxtw (sign-extended offset)
* 32-bit scaled offsets:
  * @llvm.aarch64.sve.st1.scatter.uxtw.index (zero-extended offset)
  * @llvm.aarch64.sve.st1.scatter.sxtw.index (sign-extended offset)
* vector base + immediate:
  * @llvm.aarch64.sve.st1.scatter.imm
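
In all of these the operand order is: data to store, predicate, base, then offset/index (or vector base, then immediate), as defined by the intrinsic classes in the diff below. As a rough illustration, a minimal IR sketch of the 64-bit unscaled form (function and value names are illustrative, not taken from this patch):

    ;; Stores the active lanes of %data to %base + %offsets (byte offsets),
    ;; under predicate %pg.
    declare void @llvm.aarch64.sve.st1.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)

    define void @scatter_i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg,
                             i64* %base, <vscale x 2 x i64> %offsets) {
      ; Expected to lower to something like: st1d { z0.d }, p0, [x0, z1.d]
      call void @llvm.aarch64.sve.st1.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets)
      ret void
    }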

Reviewers: rengolin, efriedma, sdesmalen

Reviewed By: efriedma, sdesmalen

Subscribers: kmclaughlin, eli.friedman, tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71074
parent 22caa3cf
llvm/include/llvm/IR/IntrinsicsAArch64.td  +59 −0
@@ -1064,6 +1064,35 @@ class AdvSIMD_1VectorArg_Imm_Intrinsic
                 llvm_i32_ty],
                [IntrNoMem, ImmArg<1>]>;

class AdvSIMD_ScatterStore_64bitOffset_Intrinsic
    : Intrinsic<[],
               [
                 llvm_anyvector_ty,
                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                 LLVMPointerToElt<0>,
                 LLVMScalarOrSameVectorWidth<0, llvm_i64_ty>
               ],
               [IntrWriteMem, IntrArgMemOnly]>;

class AdvSIMD_ScatterStore_32bitOffset_Intrinsic
    : Intrinsic<[],
               [
                 llvm_anyvector_ty,
                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                 LLVMPointerToElt<0>,
                 LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>
               ],
               [IntrWriteMem, IntrArgMemOnly]>;

class AdvSIMD_ScatterStore_VectorBase_Intrinsic
    : Intrinsic<[],
               [
                 llvm_anyvector_ty,
                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                 llvm_anyvector_ty, llvm_i64_ty
               ],
               [IntrWriteMem, IntrArgMemOnly, ImmArg<3>]>;

//
// Loads
//
@@ -1406,6 +1435,36 @@ def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_32bitOffset_Intri
// vector base + immediate index
def int_aarch64_sve_ld1_gather_imm : AdvSIMD_GatherLoad_VecTorBase_Intrinsic;

//
// Scatter stores:
//

// scalar + vector, 64 bit unscaled offsets
def int_aarch64_sve_st1_scatter : AdvSIMD_ScatterStore_64bitOffset_Intrinsic;

// scalar + vector, 64 bit scaled offsets
def int_aarch64_sve_st1_scatter_index
    : AdvSIMD_ScatterStore_64bitOffset_Intrinsic;

//  scalar + vector, 32 bit unscaled offsets, sign (sxtw) or zero (uxtw)
//  extended to 64 bits
def int_aarch64_sve_st1_scatter_sxtw
    : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;

def int_aarch64_sve_st1_scatter_uxtw
    : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;

//  scalar + vector, 32 bit scaled offsets, sign (sxtw) or zero (uxtw) extended
//  to 64 bits
def int_aarch64_sve_st1_scatter_sxtw_index
    : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;

def int_aarch64_sve_st1_scatter_uxtw_index
    : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;

// vector base + immediate index
def int_aarch64_sve_st1_scatter_imm : AdvSIMD_ScatterStore_VectorBase_Intrinsic;

//
// SVE2 - Non-widening pairwise arithmetic
//
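
Aside: the three intrinsic classes above fully determine the IR-level signatures. A hedged sketch of the 32-bit sign-extended (sxtw) unscaled form for packed 32-bit data (illustrative names):

    ;; The <vscale x 4 x i32> offsets are sign-extended to 64 bits by the
    ;; instruction itself before being added to %base.
    declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)

    define void @scatter_sxtw_i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg,
                                  i32* %base, <vscale x 4 x i32> %offsets) {
      ; Expected to lower to something like: st1w { z0.s }, p0, [x0, z1.s, sxtw]
      call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %offsets)
      ret void
    }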
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  +94 −0
@@ -1357,6 +1357,13 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
  case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
  case AArch64ISD::GLD1S_IMM:         return "AArch64ISD::GLD1S_IMM";
  case AArch64ISD::SST1:              return "AArch64ISD::SST1";
  case AArch64ISD::SST1_SCALED:       return "AArch64ISD::SST1_SCALED";
  case AArch64ISD::SST1_SXTW:         return "AArch64ISD::SST1_SXTW";
  case AArch64ISD::SST1_UXTW:         return "AArch64ISD::SST1_UXTW";
  case AArch64ISD::SST1_SXTW_SCALED:  return "AArch64ISD::SST1_SXTW_SCALED";
  case AArch64ISD::SST1_UXTW_SCALED:  return "AArch64ISD::SST1_UXTW_SCALED";
  case AArch64ISD::SST1_IMM:          return "AArch64ISD::SST1_IMM";
  }
  return nullptr;
}
@@ -12080,6 +12087,75 @@ static MVT getSVEContainerType(EVT ContentTy) {
  }
}

static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
                                        unsigned Opcode,
                                        bool OnlyPackedOffsets = true) {
  const SDValue Src = N->getOperand(2);
  const EVT SrcVT = Src->getValueType(0);
  assert(SrcVT.isScalableVector() &&
         "Scatter stores are only possible for SVE vectors");

  SDLoc DL(N);
  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();

  // Make sure that source data will fit into an SVE register
  if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // For FPs, ACLE only supports _packed_ single and double precision types.
  if (SrcElVT.isFloatingPoint())
    if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
      return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  const SDValue Base = N->getOperand(4);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets (that fits into one register)
  SDValue Offset = N->getOperand(5);

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  if (!TLI.isTypeLegal(Offset.getValueType()))
    return SDValue();

  // Source value type that is representable in hardware
  EVT HwSrcVt = getSVEContainerType(SrcVT);

  // Keep the original type of the input data to store - this is needed to
  // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the
  // integer equivalent, so just use HwSrcVt.
  SDValue InputVT = DAG.getValueType(SrcVT);
  if (SrcVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue SrcNew;

  if (Src.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);

  SDValue Ops[] = {N->getOperand(0), // Chain
                   SrcNew,
                   N->getOperand(3), // Pg
                   Base,
                   Offset,
                   InputVT};

  return DAG.getNode(Opcode, DL, VTs, Ops);
}

static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
                                       unsigned Opcode) {
  EVT RetVT = N->getValueType(0);
@@ -12300,6 +12376,24 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED);
    case Intrinsic::aarch64_sve_ld1_gather_imm:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
    case Intrinsic::aarch64_sve_st1_scatter:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1);
    case Intrinsic::aarch64_sve_st1_scatter_index:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED);
    case Intrinsic::aarch64_sve_st1_scatter_sxtw:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_uxtw:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_imm:
      return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM);
    default:
      break;
    }
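
The OnlyPackedOffsets=false path above is easiest to see from IR: for the sxtw/uxtw variants with 64-bit data, the offsets arrive as an unpacked <vscale x 2 x i32> vector, and performST1ScatterCombine widens them to nxv2i64 with ANY_EXTEND (the high bits are irrelevant because the SST1_SXTW/SST1_UXTW node itself encodes the extension). A hedged sketch (illustrative names):

    declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

    define void @scatter_sxtw_unpacked(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg,
                                       i64* %base, <vscale x 2 x i32> %offsets) {
      ; The nxv2i32 offsets are legalized to nxv2i64 by the combine above.
      call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %offsets)
      ret void
    }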
llvm/lib/Target/AArch64/AArch64ISelLowering.h  +8 −0
@@ -223,6 +223,14 @@ enum NodeType : unsigned {
  GLD1S_UXTW_SCALED,
  GLD1S_SXTW_SCALED,
  GLD1S_IMM,
  // Scatter store
  SST1,
  SST1_SCALED,
  SST1_UXTW,
  SST1_SXTW,
  SST1_UXTW_SCALED,
  SST1_SXTW_SCALED,
  SST1_IMM,

  // NEON Load/Store with post-increment base updates
  LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
llvm/lib/Target/AArch64/AArch64InstrFormats.td  +29 −0
@@ -393,6 +393,27 @@ def uimm5s8 : Operand<i64>, ImmLeaf<i64,
  let PrintMethod = "printImmScale<8>";
}

// tuimm5sN predicate - similar to uimm5sN, but uses TImmLeaf (TargetConstant)
// instead of ImmLeaf (Constant)
def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
                [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
                UImmS2XForm> {
  let ParserMatchClass = UImm5s2Operand;
  let PrintMethod = "printImmScale<2>";
}
def tuimm5s4 : Operand<i64>, TImmLeaf<i64,
                [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
                UImmS4XForm> {
  let ParserMatchClass = UImm5s4Operand;
  let PrintMethod = "printImmScale<4>";
}
def tuimm5s8 : Operand<i64>, TImmLeaf<i64,
                [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
                UImmS8XForm> {
  let ParserMatchClass = UImm5s8Operand;
  let PrintMethod = "printImmScale<8>";
}

// uimm6sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 64 * N].
def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
@@ -750,6 +771,14 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
  let ParserMatchClass = Imm0_31Operand;
}

// timm0_31 predicate - same as imm0_31, but uses TImmLeaf (TargetConstant)
// instead of ImmLeaf (Constant)
def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
  return ((uint64_t)Imm) < 32;
}]> {
  let ParserMatchClass = Imm0_31Operand;
}

// True if the 32-bit immediate is in the range [0,31]
def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
  return ((uint64_t)Imm) < 32;
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td  +54 −32
@@ -20,6 +20,24 @@ def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [
  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;

def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [
  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;

def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [
  SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;

def AArch64st1_scatter               : SDNode<"AArch64ISD::SST1",               SDT_AArch64_SST1,     [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_scaled        : SDNode<"AArch64ISD::SST1_SCALED",        SDT_AArch64_SST1,     [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_uxtw          : SDNode<"AArch64ISD::SST1_UXTW",          SDT_AArch64_SST1,     [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_sxtw          : SDNode<"AArch64ISD::SST1_SXTW",          SDT_AArch64_SST1,     [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_uxtw_scaled   : SDNode<"AArch64ISD::SST1_UXTW_SCALED",   SDT_AArch64_SST1,     [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_sxtw_scaled   : SDNode<"AArch64ISD::SST1_SXTW_SCALED",   SDT_AArch64_SST1,     [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_imm           : SDNode<"AArch64ISD::SST1_IMM",           SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;

def AArch64ld1_gather                : SDNode<"AArch64ISD::GLD1",               SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather_scaled         : SDNode<"AArch64ISD::GLD1_SCALED",        SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather_uxtw           : SDNode<"AArch64ISD::GLD1_UXTW",          SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -584,51 +602,55 @@ let Predicates = [HasSVE] in {
  defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
  defm ST1D   : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

-  // Scatters using unscaled 32-bit offsets, e.g.
-  //    st1h z0.s, p0, [x0, z0.s, uxtw]
-  // and unpacked:
-  //    st1h z0.d, p0, [x0, z0.d, uxtw]
-  defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
-  defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
-  defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
-  defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
-  defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
-  defm SST1W   : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
-  defm SST1D   : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+  // Scatters using unpacked, unscaled 32-bit offsets, e.g.
+  //    st1h z0.d, p0, [x0, z0.d, uxtw]
+  defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+  defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+  defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+  defm SST1D   : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+
+  // Scatters using packed, unscaled 32-bit offsets, e.g.
+  //    st1h z0.s, p0, [x0, z0.s, uxtw]
+  defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+  defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+  defm SST1W   : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;

-  // Scatters using scaled 32-bit offsets, e.g.
-  //    st1h z0.s, p0, [x0, z0.s, uxtw #1]
-  // and unpacked:
-  //    st1h z0.d, p0, [x0, z0.d, uxtw #1]
-  defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
-  defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
-  defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
-  defm SST1W   : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
-  defm SST1D   : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+  // Scatters using packed, scaled 32-bit offsets, e.g.
+  //    st1h z0.s, p0, [x0, z0.s, uxtw #1]
+  defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+  defm SST1W   : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+
+  // Scatters using unpacked, scaled 32-bit offsets, e.g.
+  //    st1h z0.d, p0, [x0, z0.d, uxtw #1]
+  defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+  defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+  defm SST1D   : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;

   // Scatters using 32/64-bit pointers with offset, e.g.
   //    st1h z0.s, p0, [z0.s, #16]
-  defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
-  defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
-  defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
-  defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
-  defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
-  defm SST1W   : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
-  defm SST1D   : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;
+  defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>;
+  defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>;
+  defm SST1W   : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>;
+
+  // Scatters using 32/64-bit pointers with offset, e.g.
+  //    st1h z0.d, p0, [z0.d, #16]
+  defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>;
+  defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>;
+  defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>;
+  defm SST1D   : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>;

   // Scatters using unscaled 64-bit offsets, e.g.
   //    st1h z0.d, p0, [x0, z0.d]
-  defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
-  defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
-  defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
-  defm SST1D   : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;
+  defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>;
+  defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>;
+  defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
+  defm SST1D   : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;

   // Scatters using scaled 64-bit offsets, e.g.
   //    st1h z0.d, p0, [x0, z0.d, lsl #1]
-  defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
-  defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
-  defm SST1D_SCALED   : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;
+  defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
+  defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
+  defm SST1D_SCALED   : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;

  // ST(2|3|4) structured stores (register + immediate)
  defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b,   "st2b", simm4s2>;
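
To tie the vector-base patterns back to the IR level, a hedged sketch of the vector base + immediate form (illustrative names; for st1d the immediate must satisfy the tuimm5s8 predicate above, i.e. a multiple of 8 below 256):

    ;; Each lane of %bases is a pointer; 16 is a byte offset applied to every lane.
    declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)

    define void @scatter_imm(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg,
                             <vscale x 2 x i64> %bases) {
      ; Expected to lower to something like: st1d { z0.d }, p0, [z1.d, #16]
      call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %bases, i64 16)
      ret void
    }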