Commit e9e1daf2 authored by David Green's avatar David Green
Browse files

[ARM] Remove VHADD patterns

These instructions do not work quite like I expected them to. They
perform the addition and then shift in a higher precision integer, so do
not match up with the patterns that we added.

For example with s8 elements, adding 100 and 100 should wrap, leaving the
shift to work on a negative number. VHADD will instead do the arithmetic in
higher precision, giving 100 overall. The vhadd gives a "better" result,
but not one that matches up with the input IR.

I am just removing the patterns here. We might be able to re-add them in
the future by checking for wrap flags or changing bitwidths. But for the
moment just remove them to remove the problem cases.
parent 44b99428
Loading
Loading
Loading
Loading
+0 −54
Original line number Diff line number Diff line
@@ -1785,60 +1785,6 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>;
def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;

let Predicates = [HasMVEInt] in {
  def : Pat<(v16i8 (ARMvshrsImm
                     (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
            (v16i8 (MVE_VHADDs8
                     (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
  def : Pat<(v8i16 (ARMvshrsImm
                     (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
            (v8i16 (MVE_VHADDs16
                     (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
  def : Pat<(v4i32 (ARMvshrsImm
                     (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
            (v4i32 (MVE_VHADDs32
                     (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;

  def : Pat<(v16i8 (ARMvshruImm
                     (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
            (v16i8 (MVE_VHADDu8
                     (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
  def : Pat<(v8i16 (ARMvshruImm
                     (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
            (v8i16 (MVE_VHADDu16
                     (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
  def : Pat<(v4i32 (ARMvshruImm
                     (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
            (v4i32 (MVE_VHADDu32
                     (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;

  def : Pat<(v16i8 (ARMvshrsImm
                     (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
            (v16i8 (MVE_VHSUBs8
                     (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
  def : Pat<(v8i16 (ARMvshrsImm
                     (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
            (v8i16 (MVE_VHSUBs16
                     (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
  def : Pat<(v4i32 (ARMvshrsImm
                     (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
            (v4i32 (MVE_VHSUBs32
                     (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;

  def : Pat<(v16i8 (ARMvshruImm
                     (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
            (v16i8 (MVE_VHSUBu8
                     (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
  def : Pat<(v8i16 (ARMvshruImm
                     (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
            (v8i16 (MVE_VHSUBu16
                     (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
  def : Pat<(v4i32 (ARMvshruImm
                     (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
            (v4i32 (MVE_VHSUBu32
                     (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
}

class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
  : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
          "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
+48 −24
Original line number Diff line number Diff line
@@ -4,7 +4,8 @@
define arm_aapcs_vfpcc <16 x i8> @add_ashr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: add_ashr_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vshr.s8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <16 x i8> %src1, %src2
@@ -15,7 +16,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @add_ashr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: add_ashr_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <8 x i16> %src1, %src2
@@ -26,7 +28,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @add_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: add_ashr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add nsw <4 x i32> %src1, %src2
@@ -37,7 +40,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @add_lshr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: add_lshr_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vshr.u8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <16 x i8> %src1, %src2
@@ -48,7 +52,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @add_lshr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: add_lshr_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <8 x i16> %src1, %src2
@@ -59,7 +64,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @add_lshr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: add_lshr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add nsw <4 x i32> %src1, %src2
@@ -70,7 +76,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @sub_ashr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: sub_ashr_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.s8 q0, q0, q1
; CHECK-NEXT:    vsub.i8 q0, q0, q1
; CHECK-NEXT:    vshr.s8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <16 x i8> %src1, %src2
@@ -81,7 +88,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @sub_ashr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: sub_ashr_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.s16 q0, q0, q1
; CHECK-NEXT:    vsub.i16 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <8 x i16> %src1, %src2
@@ -92,7 +100,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sub_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: sub_ashr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.s32 q0, q0, q1
; CHECK-NEXT:    vsub.i32 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub nsw <4 x i32> %src1, %src2
@@ -103,7 +112,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @sub_lshr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: sub_lshr_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.u8 q0, q0, q1
; CHECK-NEXT:    vsub.i8 q0, q0, q1
; CHECK-NEXT:    vshr.u8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <16 x i8> %src1, %src2
@@ -114,7 +124,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @sub_lshr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: sub_lshr_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.u16 q0, q0, q1
; CHECK-NEXT:    vsub.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <8 x i16> %src1, %src2
@@ -125,7 +136,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sub_lshr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: sub_lshr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.u32 q0, q0, q1
; CHECK-NEXT:    vsub.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub nsw <4 x i32> %src1, %src2
@@ -140,7 +152,8 @@ define arm_aapcs_vfpcc <16 x i8> @add_sdiv_v16i8(<16 x i8> %src1, <16 x i8> %src
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vshr.u8 q1, q0, #7
; CHECK-NEXT:    vhadd.s8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vshr.s8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <16 x i8> %src1, %src2
@@ -153,7 +166,8 @@ define arm_aapcs_vfpcc <8 x i16> @add_sdiv_v8i16(<8 x i16> %src1, <8 x i16> %src
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q1, q0, #15
; CHECK-NEXT:    vhadd.s16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <8 x i16> %src1, %src2
@@ -166,7 +180,8 @@ define arm_aapcs_vfpcc <4 x i32> @add_sdiv_v4i32(<4 x i32> %src1, <4 x i32> %src
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q1, q0, #31
; CHECK-NEXT:    vhadd.s32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add nsw <4 x i32> %src1, %src2
@@ -177,7 +192,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @add_udiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: add_udiv_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vshr.u8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <16 x i8> %src1, %src2
@@ -188,7 +204,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @add_udiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: add_udiv_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add <8 x i16> %src1, %src2
@@ -199,7 +216,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @add_udiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: add_udiv_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = add nsw <4 x i32> %src1, %src2
@@ -212,7 +230,8 @@ define arm_aapcs_vfpcc <16 x i8> @sub_sdiv_v16i8(<16 x i8> %src1, <16 x i8> %src
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vsub.i8 q0, q0, q1
; CHECK-NEXT:    vshr.u8 q1, q0, #7
; CHECK-NEXT:    vhadd.s8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vshr.s8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <16 x i8> %src1, %src2
@@ -225,7 +244,8 @@ define arm_aapcs_vfpcc <8 x i16> @sub_sdiv_v8i16(<8 x i16> %src1, <8 x i16> %src
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vsub.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q1, q0, #15
; CHECK-NEXT:    vhadd.s16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <8 x i16> %src1, %src2
@@ -238,7 +258,8 @@ define arm_aapcs_vfpcc <4 x i32> @sub_sdiv_v4i32(<4 x i32> %src1, <4 x i32> %src
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vsub.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q1, q0, #31
; CHECK-NEXT:    vhadd.s32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub nsw <4 x i32> %src1, %src2
@@ -249,7 +270,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @sub_udiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: sub_udiv_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.u8 q0, q0, q1
; CHECK-NEXT:    vsub.i8 q0, q0, q1
; CHECK-NEXT:    vshr.u8 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <16 x i8> %src1, %src2
@@ -260,7 +282,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @sub_udiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: sub_udiv_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.u16 q0, q0, q1
; CHECK-NEXT:    vsub.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub <8 x i16> %src1, %src2
@@ -271,7 +294,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sub_udiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: sub_udiv_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhsub.u32 q0, q0, q1
; CHECK-NEXT:    vsub.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = sub nsw <4 x i32> %src1, %src2