Commit 13f2aee7 authored by Simon Pilgrim
Browse files

[X86][AVX] Generalize vperm2f128/vperm2i128 patterns to support all legal 256-bit vector types

Remove bitcasts to/from v4x64 types through vperm2f128/vperm2i128 ops to help improve shuffle combining and demanded vector elts folding.
parent 6e530a3d
Loading
Loading
Loading
Loading
+24 −15
Original line number Diff line number Diff line
@@ -35436,7 +35436,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                            DL, 256);
    }
    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
    if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
      return SDValue(); // Nothing to do!
@@ -35449,12 +35448,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      unsigned PermMask = 0;
      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
      Res = CanonicalizeShuffleInput(ShuffleVT, V1);
      Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
                        DAG.getUNDEF(ShuffleVT),
                        DAG.getTargetConstant(PermMask, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
      return DAG.getNode(
          X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
          DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
    }
    if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
@@ -35470,14 +35466,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
        unsigned PermMask = 0;
        PermMask |= ((BaseMask[0] & 3) << 0);
        PermMask |= ((BaseMask[1] & 3) << 4);
        SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
        SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
        Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT,
                          CanonicalizeShuffleInput(ShuffleVT, LHS),
                          CanonicalizeShuffleInput(ShuffleVT, RHS),
        return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
                          CanonicalizeShuffleInput(RootVT, LHS),
                          CanonicalizeShuffleInput(RootVT, RHS),
                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
        return DAG.getBitcast(RootVT, Res);
      }
    }
  }
@@ -37323,10 +37317,25 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
    return SDValue();
  }
  case X86ISD::VPERM2X128: {
    // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS.getOpcode() == ISD::BITCAST &&
        (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
      EVT SrcVT = LHS.getOperand(0).getValueType();
      if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
                                              DAG.getBitcast(SrcVT, LHS),
                                              DAG.getBitcast(SrcVT, RHS),
                                              N->getOperand(2)));
      }
    }
    // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
    if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
      return Res;
    // Combine vperm2x128 subvector shuffle with an inner concat pattern.
    // Fold vperm2x128 subvector shuffle with an inner concat pattern.
    // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.  
    auto FindSubVector128 = [&](unsigned Idx) {
      if (Idx > 3)
+31 −34
Original line number Diff line number Diff line
@@ -7287,16 +7287,12 @@ let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 timm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256]>;
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
                             (i8 timm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting.
@@ -7304,23 +7300,27 @@ def Perm2XCommuteImm : SDNodeXForm<timm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;

let Predicates = [HasAVX] in {
multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
  // Pattern with load in other operand.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
                                VR256:$src1, (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                             (Perm2XCommuteImm timm:$imm))>;
}

let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
  defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
@@ -7689,27 +7689,24 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 timm:$src3)))]>,
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 timm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;

let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
+4 −6
Original line number Diff line number Diff line
@@ -587,9 +587,8 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
@@ -743,9 +742,8 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
+2 −3
Original line number Diff line number Diff line
@@ -5138,9 +5138,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm1
; AVX2-NEXT:    vpmovmskb %ymm1, %eax
; AVX2-NEXT:    notl %eax
+4 −8
Original line number Diff line number Diff line
@@ -626,16 +626,14 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices)
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
@@ -1049,16 +1047,14 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
Loading