Commit 2939fc13 authored by Sanne Wouda's avatar Sanne Wouda
Browse files

[AArch64] Add IR intrinsics for sq(r)dmulh_lane(q)

Summary:
Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h),
are represented in LLVM IR as a (by vector) sqdmulh and a vector of (repeated)
indices, like so:

   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)

When %v's values are known, the shufflevector is optimized away and we are no
longer able to select the lane variant of sqdmulh in the backend.

This defeats a (hand-coded) optimization that packs several constants into a
single vector and uses the lane intrinsics to reduce register pressure and
trade-off materialising several constants for a single vector load from the
constant pool, like so:

   int16x8_t v = {2,3,4,5,6,7,8,9};
   a = vqdmulh_laneq_s16(a, v, 0);
   b = vqdmulh_laneq_s16(b, v, 1);
   c = vqdmulh_laneq_s16(c, v, 2);
   d = vqdmulh_laneq_s16(d, v, 3);
   [...]

In one microbenchmark from libjpeg-turbo this accounts for a 2.5% to 4%
performance difference.

We could teach the compiler to recover the lane variants, but this would likely
require its own pass.  (Alternatively, "volatile" could be used on the constants
vector, but this is a bit ugly.)

This patch instead implements the following LLVM IR intrinsics for AArch64 to
maintain the original structure through IR optmization and into instruction
selection:
- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq.

These 'lane' variants need an additional register class.  The second argument
must be in the lower half of the 64-bit NEON register file, but only when
operating on i16 elements.

Note that the existing patterns for shufflevector and sqdmulh into sqdmulh_lane
(etc.) remain, so code that does not rely on NEON intrinsics to generate these
instructions is not affected.

This patch also changes clang to emit these IR intrinsics for the corresponding
NEON intrinsics (AArch64 only).

Reviewers: SjoerdMeijer, dmgreen, t.p.northover, rovka, rengolin, efriedma

Reviewed By: efriedma

Subscribers: kristof.beyls, hiraditya, jdoerfert, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71469
parent f719b0ba
Loading
Loading
Loading
Loading
+12 −4
Original line number Diff line number Diff line
@@ -528,9 +528,16 @@ def VMULL_LANE : SOpInst<"vmull_lane", "(>Q)..I", "siUsUi", OP_MULL_LN>;
def VQDMULL_N     : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
def VQDMULL_LANE  : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
def VQDMULH_N     : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_N    : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;

let ArchGuard = "!defined(__aarch64__)" in {
def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
}
let ArchGuard = "defined(__aarch64__)" in {
def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
}

let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@ def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si",
def VQDMULL_HIGH_LANEQ  : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
                                  OP_QDMULLHi_LN>;

def VQDMULH_LANEQ  : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;

let isLaneQ = 1 in {
def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;
+26 −0
Original line number Diff line number Diff line
@@ -4969,14 +4969,22 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
@@ -5754,6 +5762,24 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
    Ops.resize(2);
    return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
  }
  case NEON::BI__builtin_neon_vqdmulhq_lane_v:
  case NEON::BI__builtin_neon_vqdmulh_lane_v:
  case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
  case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
    llvm::Type *Tys[2] = {
        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                            /*isQuad*/ false))};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
  }
  case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
  case NEON::BI__builtin_neon_vqdmulh_laneq_v:
  case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
  case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
    llvm::Type *Tys[2] = {
        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                            /*isQuad*/ true))};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
  }
  case NEON::BI__builtin_neon_vqshl_n_v:
  case NEON::BI__builtin_neon_vqshlq_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
+160 −160

File changed.

Preview size limit exceeded, changes collapsed.

+8 −0
Original line number Diff line number Diff line
@@ -133,6 +133,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
    : Intrinsic<[llvm_anyvector_ty],
                [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
                [IntrNoMem]>;
  class AdvSIMD_2VectorArg_Lane_Intrinsic
    : Intrinsic<[llvm_anyint_ty],
                [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
                [IntrNoMem]>;

  class AdvSIMD_3VectorArg_Intrinsic
      : Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +211,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {

  // Vector Saturating Doubling Multiply High
  def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
  def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
  def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;

  // Vector Saturating Rounding Doubling Multiply High
  def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
  def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
  def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;

  // Vector Polynominal Multiply
  def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
+61 −0
Original line number Diff line number Diff line
@@ -360,6 +360,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;

def UImmS1XForm : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -7968,6 +7971,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
  }
}

multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
                                 SDPatternOperator OpNodeLaneQ> {

  def : Pat<(v4i16 (OpNodeLane
                     (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v4i16 (OpNodeLaneQ
                     (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
                     VectorIndexH32b:$idx)),
            (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

  def : Pat<(v8i16 (OpNodeLane
                     (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), $Rm, dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v8i16 (OpNodeLaneQ
                     (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
                     VectorIndexH32b:$idx)),
            (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

  def : Pat<(v2i32 (OpNodeLane
                     (v2i32 V64:$Rn), (v2i32 V64:$Rm),
                     VectorIndexD32b:$idx)),
            (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v2i32 (OpNodeLaneQ
                     (v2i32 V64:$Rn), (v4i32 V128:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

  def : Pat<(v4i32 (OpNodeLane
                     (v4i32 V128:$Rn), (v2i32 V64:$Rm),
                     VectorIndexD32b:$idx)),
            (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), $Rm, dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v4i32 (OpNodeLaneQ
                     (v4i32 V128:$Rn),
                     (v4i32 V128:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

}

multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
                         SDPatternOperator OpNode> {
  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
Loading