Commit 09c8f389 authored by Craig Topper's avatar Craig Topper
Browse files

[X86] Add isel patterns for X86VBroadcast with i16 truncates from i16->i64 zextload/extload.

We can form vpbroadcastw with a folded load.

We had patterns for i16->i32 zextload/extload, but nothing prevents
i64 from occurring.

I'd like to move this all to DAG combine to fix more cases, but
this is a trivial fix to minimize test diffs when moving to a combine.
parent 51a4c612
Loading
Loading
Loading
Loading
+26 −6
Original line number Diff line number Diff line
@@ -1427,26 +1427,46 @@ let Predicates = [HasVLX, HasBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
              (i16 (trunc (extloadi32i16 addr:$src))))),
            (VPBROADCASTWZ128rm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
              (i16 (trunc (zextloadi32i16 addr:$src))))),
            (VPBROADCASTWZ128rm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
              (i16 (trunc (extloadi32i16 addr:$src))))),
            (VPBROADCASTWZ256rm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
              (i16 (trunc (zextloadi32i16 addr:$src))))),
            (VPBROADCASTWZ256rm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (extloadi64i16 addr:$src))))),
            (VPBROADCASTWZ128rm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (zextloadi64i16 addr:$src))))),
            (VPBROADCASTWZ128rm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (extloadi64i16 addr:$src))))),
            (VPBROADCASTWZ256rm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (zextloadi64i16 addr:$src))))),
            (VPBROADCASTWZ256rm addr:$src)>;
}
let Predicates = [HasBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v32i16 (X86VBroadcast
              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
              (i16 (trunc (extloadi32i16 addr:$src))))),
            (VPBROADCASTWZrm addr:$src)>;
  def : Pat<(v32i16 (X86VBroadcast
              (i16 (trunc (zextloadi32i16 addr:$src))))),
            (VPBROADCASTWZrm addr:$src)>;
  def : Pat<(v32i16 (X86VBroadcast
              (i16 (trunc (extloadi64i16 addr:$src))))),
            (VPBROADCASTWZrm addr:$src)>;
  def : Pat<(v32i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
              (i16 (trunc (zextloadi64i16 addr:$src))))),
            (VPBROADCASTWZrm addr:$src)>;
}
+17 −4
Original line number Diff line number Diff line
@@ -7518,16 +7518,29 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
              (i16 (trunc (extloadi32i16 addr:$src))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
              (i16 (trunc (zextloadi32i16 addr:$src))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
              (i16 (trunc (extloadi32i16 addr:$src))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
              (i16 (trunc (zextloadi32i16 addr:$src))))),
            (VPBROADCASTWYrm addr:$src)>;

  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (extloadi64i16 addr:$src))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (zextloadi64i16 addr:$src))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (extloadi64i16 addr:$src))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (zextloadi64i16 addr:$src))))),
            (VPBROADCASTWYrm addr:$src)>;
}

+15 −45
Original line number Diff line number Diff line
@@ -3331,18 +3331,10 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movzwl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT:    retq
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %xmm0
; AVX2OR512VL-NEXT:    retq
;
; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
; XOPAVX1:       # %bb.0:
@@ -3353,9 +3345,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %xmm0
; XOPAVX2-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 4
  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -3392,18 +3382,10 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movzwl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT:    retq
; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %xmm0
; AVX2OR512VL-NEXT:    retq
;
; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
; XOPAVX1:       # %bb.0:
@@ -3414,9 +3396,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %xmm0
; XOPAVX2-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 4
  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -3442,18 +3422,10 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movzwl (%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movzwl (%rdi), %eax
; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT:    retq
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX2OR512VL-NEXT:    retq
;
; XOPAVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
; XOPAVX1:       # %bb.0:
@@ -3465,9 +3437,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    movzwl (%rdi), %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
; XOPAVX2-NEXT:    retq
  %tmp = load i16, i16* %ptr, align 2
  %tmp1 = sext i16 %tmp to i64
+15 −45
Original line number Diff line number Diff line
@@ -7546,18 +7546,10 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movzwl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
; AVX512VL-NEXT:    retq
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
; AVX2OR512VL-NEXT:    retq
;
; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
; XOPAVX1:       # %bb.0:
@@ -7569,9 +7561,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %ymm0
; XOPAVX2-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 4
  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -7588,18 +7578,10 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movzwl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
; AVX512VL-NEXT:    retq
; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
; AVX2OR512VL-NEXT:    retq
;
; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
; XOPAVX1:       # %bb.0:
@@ -7610,9 +7592,7 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %ymm0
; XOPAVX2-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 4
  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -7631,18 +7611,10 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movzwl (%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movzwl (%rdi), %eax
; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
; AVX512VL-NEXT:    retq
; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX2OR512VL-NEXT:    retq
;
; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; XOPAVX1:       # %bb.0:
@@ -7655,9 +7627,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    movzwl (%rdi), %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm0
; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; XOPAVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
; XOPAVX2-NEXT:    retq
  %tmp = load i16, i16* %ptr, align 2
  %tmp1 = sext i16 %tmp to i64
+6 −15
Original line number Diff line number Diff line
@@ -333,16 +333,13 @@ define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64:
; KNL:       ## %bb.0:
; KNL-NEXT:    movzwl 6(%rdi), %eax
; KNL-NEXT:    vmovd %eax, %xmm0
; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
; KNL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    movzwl 6(%rdi), %eax
; SKX-NEXT:    vpbroadcastw %eax, %zmm0
; SKX-NEXT:    vpbroadcastw 6(%rdi), %zmm0
; SKX-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 4
  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -354,16 +351,13 @@ define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64:
; KNL:       ## %bb.0:
; KNL-NEXT:    movzwl 6(%rdi), %eax
; KNL-NEXT:    vmovd %eax, %xmm0
; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
; KNL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    movzwl 6(%rdi), %eax
; SKX-NEXT:    vpbroadcastw %eax, %zmm0
; SKX-NEXT:    vpbroadcastw 6(%rdi), %zmm0
; SKX-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 4
  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -375,16 +369,13 @@ define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; KNL:       ## %bb.0:
; KNL-NEXT:    movzwl (%rdi), %eax
; KNL-NEXT:    vmovd %eax, %xmm0
; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
; KNL-NEXT:    vpbroadcastw (%rdi), %ymm0
; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    movzwl (%rdi), %eax
; SKX-NEXT:    vpbroadcastw %eax, %zmm0
; SKX-NEXT:    vpbroadcastw (%rdi), %zmm0
; SKX-NEXT:    retq
  %tmp = load i16, i16* %ptr, align 2
  %tmp1 = sext i16 %tmp to i64