Commit 535ed62c authored by Florian Hahn's avatar Florian Hahn Committed by Florian Hahn
Browse files

[AArch64] Add custom store lowering for 256 bit non-temporal stores.

Currently we fail to lower non-temporal stores for 256+ bit vectors
to STNPQ, because type legalization will split them up into 128 bit stores
and, because there are no un-paired non-temporal stores, creating STNPQ
in the Load/Store optimizer would be quite tricky.

This patch adds custom lowering for 256 bit non-temporal vector stores
to improve the generated code.

Reviewers: dmgreen, samparker, t.p.northover, ab

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D72919
parent e47965bf
Loading
Loading
Loading
Loading
+36 −0
Original line number Diff line number Diff line
@@ -525,6 +525,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);
  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);
  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
@@ -1382,6 +1393,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case AArch64ISD::SST1_IMM:          return "AArch64ISD::SST1_IMM";
  case AArch64ISD::LDP:               return "AArch64ISD::LDP";
  case AArch64ISD::STP:               return "AArch64ISD::STP";
  case AArch64ISD::STNP:              return "AArch64ISD::STNP";
  }
  return nullptr;
}
@@ -3070,6 +3082,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
    if (StoreNode->isTruncatingStore()) {
      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
    }
    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
    // the custom lowering, as there are no un-paired non-temporal stores and
    // legalization will break up 256 bit inputs.
    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
        MemVT.getVectorElementCount().Min % 2u == 0 &&
        ((MemVT.getScalarSizeInBits() == 8u ||
          MemVT.getScalarSizeInBits() == 16u ||
          MemVT.getScalarSizeInBits() == 32u ||
          MemVT.getScalarSizeInBits() == 64u))) {
      SDValue Lo =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
      SDValue Hi = DAG.getNode(
          ISD::EXTRACT_SUBVECTOR, Dl,
          MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
          StoreNode->getValue(),
          DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
      SDValue Result = DAG.getMemIntrinsicNode(
          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
      return Result;
    }
  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
    assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
    SDValue Lo =
+2 −1
Original line number Diff line number Diff line
@@ -272,7 +272,8 @@ enum NodeType : unsigned {
  STZ2G,

  LDP,
  STP
  STP,
  STNP
};

} // end namespace AArch64ISD
+6 −0
Original line number Diff line number Diff line
@@ -245,6 +245,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,

def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Type profile for AArch64ISD::STNP: no results, three operands — two
// same-typed v4i32 (128-bit) register halves plus the pointer operand.
def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;

// Generates the general dynamic sequences, i.e.
//  adrp  x0, :tlsdesc:var
@@ -544,6 +545,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;

def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
// Non-temporal store-pair node: chained, may store, and carries a
// MachineMemOperand (mirrors the AArch64stp node above).
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;

@@ -2734,6 +2736,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
          (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;

// Select an AArch64stnp of two q-registers with an in-range paired-store
// immediate (simm7, scaled by 16) to STNPQi.
def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
          (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;


//---
// (Register offset)

+144 −4
Original line number Diff line number Diff line
@@ -2,10 +2,7 @@

define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v4i64:
; CHECK-NEXT:  mov d[[HI1:[0-9]+]], v1[1]
; CHECK-NEXT:  mov d[[HI0:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d1, d[[HI1]], [x0, #16]
; CHECK-NEXT:  stnp d0, d[[HI0]], [x0]
; CHECK-NEXT:  stnp q0, q1, [x0]
; CHECK-NEXT:  ret
  store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
  ret void
@@ -334,6 +331,149 @@ define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 {
  ret void
}

; 256-bit non-temporal store: expected to lower to a single STNP of two
; q-registers instead of split 128-bit stores.
define void @test_stnp_v32i8(<32 x i8> %v, <32 x i8>* %ptr) {
; CHECK-LABEL: _test_stnp_v32i8:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <32 x i8> %v, <32 x i8>* %ptr, align 4, !nontemporal !0
  ret void
}

; 512-bit non-temporal store: expected to lower to two STNP q-pair stores,
; one per 256-bit half.
define void @test_stnp_v32i16(<32 x i16> %v, <32 x i16>* %ptr) {
; CHECK-LABEL: _test_stnp_v32i16:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <32 x i16> %v, <32 x i16>* %ptr, align 4, !nontemporal !0
  ret void
}

; 512-bit non-temporal store of half floats: expected to lower to two STNP
; q-pair stores, one per 256-bit half.
define void @test_stnp_v32f16(<32 x half> %v, <32 x half>* %ptr) {
; CHECK-LABEL: _test_stnp_v32f16:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <32 x half> %v, <32 x half>* %ptr, align 4, !nontemporal !0
  ret void
}

; 512-bit non-temporal store of i32 elements: expected to lower to two STNP
; q-pair stores, one per 256-bit half.
define void @test_stnp_v16i32(<16 x i32> %v, <16 x i32>* %ptr) {
; CHECK-LABEL: _test_stnp_v16i32:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <16 x i32> %v, <16 x i32>* %ptr, align 4, !nontemporal !0
  ret void
}

; 512-bit non-temporal store of f32 elements: expected to lower to two STNP
; q-pair stores, one per 256-bit half.
define void @test_stnp_v16f32(<16 x float> %v, <16 x float>* %ptr) {
; CHECK-LABEL: _test_stnp_v16f32:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <16 x float> %v, <16 x float>* %ptr, align 4, !nontemporal !0
  ret void
}

; Negative test: <17 x float> (544 bits, odd element count) does not satisfy
; the 256-bit/even-element condition of the custom STNP lowering, so the store
; is assembled and emitted piecewise (d-register stnp pairs plus a trailing
; scalar str).
define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) {
; CHECK-LABEL: _test_stnp_v17f32:
; CHECK-NEXT:	.cfi_startproc
; CHECK-NEXT:	ldr	s16, [sp, #16]
; CHECK-NEXT:	mov.s	v0[1], v1[0]
; CHECK-NEXT:	mov.s	v4[1], v5[0]
; CHECK-NEXT:	ldr	s1, [sp]
; CHECK-NEXT:	add	x8, sp, #20
; CHECK-NEXT:	ld1.s	{ v16 }[1], [x8]
; CHECK-NEXT:	add	x8, sp, #4
; CHECK-NEXT:	ld1.s	{ v1 }[1], [x8]
; CHECK-NEXT:	add	x8, sp, #24
; CHECK-NEXT:	ld1.s	{ v16 }[2], [x8]
; CHECK-NEXT:	add	x8, sp, #8
; CHECK-NEXT:	ld1.s	{ v1 }[2], [x8]
; CHECK-NEXT:	add	x8, sp, #28
; CHECK-NEXT:	ld1.s	{ v16 }[3], [x8]
; CHECK-NEXT:	add	x8, sp, #12
; CHECK-NEXT:	mov.s	v0[2], v2[0]
; CHECK-NEXT:	ldr	s2, [sp, #32]
; CHECK-NEXT:	mov.s	v4[2], v6[0]
; CHECK-NEXT:	mov.s	v0[3], v3[0]
; CHECK-NEXT:	mov.s	v4[3], v7[0]
; CHECK-NEXT:	mov	d3, v4[1]
; CHECK-NEXT:	mov	d5, v0[1]
; CHECK-NEXT:	ld1.s	{ v1 }[3], [x8]
; CHECK-NEXT:	stnp	d4, d3, [x0, #16]
; CHECK-NEXT:	stnp	d0, d5, [x0]
; CHECK-NEXT:	mov	d0, v16[1]
; CHECK-NEXT:	mov	d3, v1[1]
; CHECK-NEXT:	stnp	d16, d0, [x0, #48]
; CHECK-NEXT:	stnp	d1, d3, [x0, #32]
; CHECK-NEXT:	str	s2, [x0, #64]
; CHECK-NEXT:	ret

entry:
  store <17 x float> %v, <17 x float>* %ptr, align 4, !nontemporal !0
  ret void
}
; Offset test: the GEP offset (500 * 64 bytes = 32000) does not fit STNP's
; scaled 7-bit signed immediate, so the addresses are materialized in
; registers and STNP is used with a zero offset.
define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) {
; CHECK-LABEL: _test_stnp_v16i32_invalid_offset:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    mov w8, #32000
; CHECK-NEXT:    mov w9, #32032
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    add x9, x0, x9
; CHECK-NEXT:    stnp    q2, q3, [x9]
; CHECK-NEXT:    stnp    q0, q1, [x8]
; CHECK-NEXT:    ret

entry:
  %gep = getelementptr <16 x i32>, <16 x i32>* %ptr, i32 500
  store <16 x i32> %v, <16 x i32>* %gep, align 4, !nontemporal !0
  ret void
}

; 1024-bit non-temporal store of f64 elements: expected to lower to four STNP
; q-pair stores, one per 256-bit slice.
define void @test_stnp_v16f64(<16 x double> %v, <16 x double>* %ptr) {
; CHECK-LABEL: _test_stnp_v16f64:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q6, q7, [x0, #96]
; CHECK-NEXT:    stnp    q4, q5, [x0, #64]
; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <16 x double> %v, <16 x double>* %ptr, align 4, !nontemporal !0
  ret void
}

; 1024-bit non-temporal store of i64 elements: expected to lower to four STNP
; q-pair stores, one per 256-bit slice.
define void @test_stnp_v16i64(<16 x i64> %v, <16 x i64>* %ptr) {
; CHECK-LABEL: _test_stnp_v16i64:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:    stnp    q6, q7, [x0, #96]
; CHECK-NEXT:    stnp    q4, q5, [x0, #64]
; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
; CHECK-NEXT:    stnp    q0, q1, [x0]
; CHECK-NEXT:    ret

entry:
  store <16 x i64> %v, <16 x i64>* %ptr, align 4, !nontemporal !0
  ret void
}

!0 = !{ i32 1 }

attributes #0 = { nounwind }