Commit 9897daa6 authored by Sumanth Gundapaneni's avatar Sumanth Gundapaneni
Browse files

Update LSR's logic that identifies a post-increment SCEV value.

One of the checks has been removed as it seems invalid.
The LoopStep size is almost always 32 bits.

Differential Revision: https://reviews.llvm.org/D75079
parent d7803c38
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -3530,9 +3530,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
  const SCEV *LoopStep = AR->getStepRecurrence(SE);
  if (!isa<SCEVConstant>(LoopStep))
    return false;
  if (LU.AccessTy.getType()->getScalarSizeInBits() !=
      LoopStep->getType()->getScalarSizeInBits())
    return false;
  // Check if a post-indexed load/store can be used.
  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
      TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+1 −1
Original line number Diff line number Diff line
; RUN: llc -march=hexagon < %s | FileCheck %s

; CHECK: [[REG0:(r[0-9]+)]] = add(r29
; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#4)
; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#8)
; CHECK-DAG: memd([[REG1]]+#8) =
; CHECK-DAG: memd([[REG1]]+#0) =

+50 −0
Original line number Diff line number Diff line
; RUN: llc -O3 -march=hexagon < %s | FileCheck %s
; Test to ensure LSR does not optimize out the addrec of the outer loop.
; This helps to generate post-increment instructions; otherwise
; it ends up as an extra reg+reg add inside the loop.
; CHECK:  loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: memuh{{.*}}++
; CHECK: endloop


; @foo: two-level loop nest that accumulates i16 values read through two
; pointers (%filt and %inp) into a running i16 sum and returns that sum.
;   %filt, %inp - base pointers to the i16 data being read
;   %c1         - trip count of the outer loop
;   %c2         - trip count of the inner loop, and the per-outer-iteration
;                 pointer stride (in i16 elements)
; Returns 0 without executing any loop when %c1 <= 0 or %c2 <= 0.
define dso_local signext i16 @foo(i16* nocapture readonly %filt, i16* nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr {
entry:
  ; Guard: enter the loop nest only when both trip counts are strictly
  ; positive (signed compare); otherwise branch straight to the return block.
  %cmp28 = icmp sgt i32 %c1, 0
  %cmp221 = icmp sgt i32 %c2, 0
  %or.cond = and i1 %cmp28, %cmp221
  br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup

; Outer-loop header. The phis carry, across outer iterations: the current
; %filt pointer, the current %inp pointer, the outer counter, and the sum.
for.cond1.preheader.us:                           ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge.us
  %filt.addr.032.us = phi i16* [ %scevgep, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %filt, %entry ]
  %inp.addr.031.us = phi i16* [ %scevgep35, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %inp, %entry ]
  %l.030.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
  %sum0.029.us = phi i16 [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
  ; %filt pointer for the next outer iteration: current pointer advanced by
  ; %c2 elements. It is computed here, in the outer header, and only consumed
  ; by the phi above.
  %scevgep = getelementptr i16, i16* %filt.addr.032.us, i32 %c2
  br label %for.body4.us

; Inner loop: runs until the counter reaches %c2. Each iteration loads one
; i16 through each pointer, adds both values to the sum, and advances each
; pointer by one element. These pointer-bumped loads are the accesses the
; memuh post-increment pattern at the top of this test is aimed at.
for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  %z.025.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
  %filt.addr.124.us = phi i16* [ %filt.addr.032.us, %for.cond1.preheader.us ], [ %incdec.ptr.us, %for.body4.us ]
  %inp.addr.123.us = phi i16* [ %inp.addr.031.us, %for.cond1.preheader.us ], [ %incdec.ptr5.us, %for.body4.us ]
  %sum0.122.us = phi i16 [ %sum0.029.us, %for.cond1.preheader.us ], [ %add8.us, %for.body4.us ]
  ; Load through the %filt pointer; the GEP is the pointer for the next iteration.
  %incdec.ptr.us = getelementptr inbounds i16, i16* %filt.addr.124.us, i32 1
  %0 = load i16, i16* %filt.addr.124.us, align 2
  ; Load through the %inp pointer; the GEP is the pointer for the next iteration.
  %incdec.ptr5.us = getelementptr inbounds i16, i16* %inp.addr.123.us, i32 1
  %1 = load i16, i16* %inp.addr.123.us, align 2
  ; sum = sum + filt value + inp value (i16 arithmetic).
  %add.us = add i16 %0, %sum0.122.us
  %add8.us = add i16 %add.us, %1
  ; Inner counter increment and exit test against %c2.
  %inc.us = add nuw nsw i32 %z.025.us, 1
  %exitcond = icmp eq i32 %inc.us, %c2
  br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us

; Outer-loop latch: advance the outer %inp pointer by %c2 elements, bump the
; outer counter, and leave the nest once the counter equals %c1.
for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
  %scevgep35 = getelementptr i16, i16* %inp.addr.031.us, i32 %c2
  %inc11.us = add nuw nsw i32 %l.030.us, 1
  %exitcond36 = icmp eq i32 %inc11.us, %c1
  br i1 %exitcond36, label %for.cond.cleanup, label %for.cond1.preheader.us

; Common exit: yields 0 when the guard in %entry failed, otherwise the
; accumulated sum from the last inner iteration.
for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  %sum0.0.lcssa = phi i16 [ 0, %entry ], [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ]
  ret i16 %sum0.0.lcssa
}
+22 −22
Original line number Diff line number Diff line
@@ -1778,11 +1778,11 @@ for.body: ; preds = %for.body, %for.body
define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_short_mac:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    cbz r2, .LBB11_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    subs r3, r2, #1
; CHECK-NEXT:    and r7, r2, #3
; CHECK-NEXT:    and r6, r2, #3
; CHECK-NEXT:    cmp r3, #3
; CHECK-NEXT:    bhs .LBB11_4
; CHECK-NEXT:  @ %bb.2:
@@ -1799,33 +1799,33 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT:    vldr s0, .LCPI11_0
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    adds r3, r1, #4
; CHECK-NEXT:    adds r2, r0, #4
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB11_5: @ %for.body
; CHECK-NEXT:  @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r2, r1, r3
; CHECK-NEXT:    adds r6, r0, r3
; CHECK-NEXT:    vldr.16 s2, [r6, #6]
; CHECK-NEXT:    ldrsh.w r4, [r3, #2]
; CHECK-NEXT:    vldr.16 s2, [r2, #2]
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    ldrsh.w r4, [r2, #2]
; CHECK-NEXT:    ldrsh.w r5, [r2, #4]
; CHECK-NEXT:    ldrsh.w r2, [r2, #6]
; CHECK-NEXT:    vmov s8, r4
; CHECK-NEXT:    vmov s6, r5
; CHECK-NEXT:    vmov s4, r2
; CHECK-NEXT:    vmov s4, r4
; CHECK-NEXT:    vcvt.f16.s32 s4, s4
; CHECK-NEXT:    ldrsh.w r4, [r3]
; CHECK-NEXT:    vmul.f16 s2, s2, s4
; CHECK-NEXT:    vldr.16 s4, [r6, #4]
; CHECK-NEXT:    vldr.16 s4, [r2]
; CHECK-NEXT:    vmov s6, r4
; CHECK-NEXT:    vcvt.f16.s32 s6, s6
; CHECK-NEXT:    ldrsh r5, [r3, #-2]
; CHECK-NEXT:    ldrsh r4, [r3, #-4]
; CHECK-NEXT:    vmul.f16 s4, s4, s6
; CHECK-NEXT:    vldr.16 s6, [r6, #2]
; CHECK-NEXT:    vldr.16 s6, [r2, #-2]
; CHECK-NEXT:    adds r3, #8
; CHECK-NEXT:    vmov s8, r5
; CHECK-NEXT:    vcvt.f16.s32 s8, s8
; CHECK-NEXT:    ldrsh r2, [r1, r3]
; CHECK-NEXT:    vmov s10, r4
; CHECK-NEXT:    vmul.f16 s6, s6, s8
; CHECK-NEXT:    vldr.16 s8, [r6]
; CHECK-NEXT:    adds r3, #8
; CHECK-NEXT:    vmov s10, r2
; CHECK-NEXT:    vldr.16 s8, [r2, #-4]
; CHECK-NEXT:    vcvt.f16.s32 s10, s10
; CHECK-NEXT:    adds r2, #8
; CHECK-NEXT:    vmul.f16 s8, s8, s10
; CHECK-NEXT:    vcvtb.f32.f16 s8, s8
; CHECK-NEXT:    vcvtb.f32.f16 s6, s6
@@ -1837,11 +1837,11 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    le lr, .LBB11_5
; CHECK-NEXT:  .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT:    wls lr, r7, .LBB11_9
; CHECK-NEXT:    wls lr, r6, .LBB11_9
; CHECK-NEXT:  @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
; CHECK-NEXT:    mov lr, r7
; CHECK-NEXT:    mov lr, r6
; CHECK-NEXT:  .LBB11_8: @ %for.body.epil
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrsh r2, [r1], #2
@@ -1854,7 +1854,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    le lr, .LBB11_8
; CHECK-NEXT:  .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.10:
; CHECK-NEXT:  .LCPI11_0:
+94 −90
Original line number Diff line number Diff line
@@ -372,28 +372,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT:    cmp.w r12, #0
; CHECK-NEXT:    beq.w .LBB5_11
; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
; CHECK-NEXT:    add.w r4, r3, r12, lsl #2
; CHECK-NEXT:    add.w r5, r1, r12
; CHECK-NEXT:    cmp r4, r1
; CHECK-NEXT:    add.w r6, r0, r12
; CHECK-NEXT:    cset r7, hi
; CHECK-NEXT:    cmp r5, r3
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    cmp r4, r0
; CHECK-NEXT:    add.w r6, r3, r12, lsl #2
; CHECK-NEXT:    add.w r4, r1, r12
; CHECK-NEXT:    cmp r6, r1
; CHECK-NEXT:    add.w r5, r0, r12
; CHECK-NEXT:    cset lr, hi
; CHECK-NEXT:    cmp  r4, r3
; CHECK-NEXT:    cset r4, hi
; CHECK-NEXT:    cmp r6, r3
; CHECK-NEXT:    cmp  r6, r0
; CHECK-NEXT:    cset r6, hi
; CHECK-NEXT:    mov.w lr, #1
; CHECK-NEXT:    ands r6, r4
; CHECK-NEXT:    lsls r6, r6, #31
; CHECK-NEXT:    cmp  r5, r3
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    ands r5, r6
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    lsls r5, r5, #31
; CHECK-NEXT:    itt  eq
; CHECK-NEXT:    andeq.w r4, r5, r7
; CHECK-NEXT:    lslseq.w r4, r4, #31
; CHECK-NEXT:    andeq.w r5, r4, lr
; CHECK-NEXT:    lslseq.w r5, r5, #31
; CHECK-NEXT:    beq     .LBB5_4
; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
; CHECK-NEXT:    sub.w r4, r12, #1
; CHECK-NEXT:   sub.w r5, r12, #1
; CHECK-NEXT:   and r9, r12, #3
; CHECK-NEXT:    cmp r4, #3
; CHECK-NEXT:   cmp r5, #3
; CHECK-NEXT:   bhs .LBB5_6
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:    mov.w r12, #0
@@ -409,33 +409,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT:    letp lr, .LBB5_5
; CHECK-NEXT:    b .LBB5_11
; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader.new
; CHECK-NEXT:    bic r7, r12, #3
; CHECK-NEXT:    bic r5, r12, #3
; CHECK-NEXT:    add.w r4, r3, #8
; CHECK-NEXT:    subs r7, #4
; CHECK-NEXT:    subs r5, #4
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    add.w lr, lr, r7, lsr #2
; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
; CHECK-NEXT:    adds r5, r0, #3
; CHECK-NEXT:    adds r6, r1, #1
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB5_7: @ %for.body
; CHECK-NEXT:   @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrb.w r5, [r0, r12]
; CHECK-NEXT:    add.w r7, r1, r12
; CHECK-NEXT:    ldrb.w r6, [r1, r12]
; CHECK-NEXT:    smlabb r5, r6, r5, r2
; CHECK-NEXT:    str r5, [r4, #-8]
; CHECK-NEXT:    add.w r5, r0, r12
; CHECK-NEXT:    ldrb r6, [r7, #1]
; CHECK-NEXT:   ldrb r8, [r5, #-3]
; CHECK-NEXT:   add.w r12, r12, #4
; CHECK-NEXT:    ldrb.w r8, [r5, #1]
; CHECK-NEXT:    smlabb r6, r6, r8, r2
; CHECK-NEXT:    str r6, [r4, #-4]
; CHECK-NEXT:    ldrb.w r8, [r5, #2]
; CHECK-NEXT:    ldrb r6, [r7, #2]
; CHECK-NEXT:    smlabb r6, r6, r8, r2
; CHECK-NEXT:    str r6, [r4]
; CHECK-NEXT:    ldrb r5, [r5, #3]
; CHECK-NEXT:    ldrb r6, [r7, #3]
; CHECK-NEXT:    smlabb r5, r6, r5, r2
; CHECK-NEXT:    str r5, [r4, #4]
; CHECK-NEXT:   ldrb r7, [r6, #-1]
; CHECK-NEXT:   smlabb r7, r7, r8, r2
; CHECK-NEXT:   str r7, [r4, #-8]
; CHECK-NEXT:   ldrb r8, [r5, #-2]
; CHECK-NEXT:   ldrb r7, [r6]
; CHECK-NEXT:   smlabb r7, r7, r8, r2
; CHECK-NEXT:   str r7, [r4, #-4]
; CHECK-NEXT:   ldrb r8, [r5, #-1]
; CHECK-NEXT:   ldrb r7, [r6, #1]
; CHECK-NEXT:   smlabb r7, r7, r8, r2
; CHECK-NEXT:   str r7, [r4]
; CHECK-NEXT:   ldrb.w r8, [r5]
; CHECK-NEXT:   adds r5, #4
; CHECK-NEXT:   ldrb r7, [r6, #2]
; CHECK-NEXT:   adds r6, #4
; CHECK-NEXT:   smlabb r7, r7, r8, r2
; CHECK-NEXT:   str r7, [r4, #4]
; CHECK-NEXT:   adds r4, #16
; CHECK-NEXT:   le lr, .LBB5_7
; CHECK-NEXT:  .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa
@@ -447,10 +449,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT:    mov lr, r9
; CHECK-NEXT:  .LBB5_10: @ %for.body.epil
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrb r7, [r0], #1
; CHECK-NEXT:    ldrb r6, [r1], #1
; CHECK-NEXT:    smlabb r7, r6, r7, r2
; CHECK-NEXT:    str r7, [r3], #4
; CHECK-NEXT:    ldrb r6, [r0], #1
; CHECK-NEXT:    ldrb r5, [r1], #1
; CHECK-NEXT:    smlabb r6, r5, r6, r2
; CHECK-NEXT:    str r6, [r3], #4
; CHECK-NEXT:    le lr, .LBB5_10
; CHECK-NEXT:  .LBB5_11: @ %for.cond.cleanup
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
@@ -663,28 +665,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT:    cmp.w r12, #0
; CHECK-NEXT:    beq.w .LBB7_11
; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
; CHECK-NEXT:    add.w r4, r3, r12, lsl #2
; CHECK-NEXT:    add.w r5, r1, r12
; CHECK-NEXT:    cmp r4, r1
; CHECK-NEXT:    add.w r6, r0, r12
; CHECK-NEXT:    cset r7, hi
; CHECK-NEXT:    cmp r5, r3
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    cmp r4, r0
; CHECK-NEXT:    add.w r6, r3, r12, lsl #2
; CHECK-NEXT:    add.w r4, r1, r12
; CHECK-NEXT:    cmp r6, r1
; CHECK-NEXT:    add.w r5, r0, r12
; CHECK-NEXT:    cset lr, hi
; CHECK-NEXT:    cmp r4, r3
; CHECK-NEXT:    cset r4, hi
; CHECK-NEXT:    cmp r6, r3
; CHECK-NEXT:    cmp r6, r0
; CHECK-NEXT:    cset r6, hi
; CHECK-NEXT:    mov.w lr, #1
; CHECK-NEXT:    ands r6, r4
; CHECK-NEXT:    lsls r6, r6, #31
; CHECK-NEXT:    cmp r5, r3
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    ands r5, r6
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    lsls r5, r5, #31
; CHECK-NEXT:    itt eq
; CHECK-NEXT:    andeq.w r4, r5, r7
; CHECK-NEXT:    lslseq.w r4, r4, #31
; CHECK-NEXT:    andeq.w r5, r4, lr
; CHECK-NEXT:    lslseq.w r5, r5, #31
; CHECK-NEXT:    beq .LBB7_4
; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
; CHECK-NEXT:    sub.w r4, r12, #1
; CHECK-NEXT:    sub.w r5, r12, #1
; CHECK-NEXT:    and r9, r12, #3
; CHECK-NEXT:    cmp r4, #3
; CHECK-NEXT:    cmp r5, #3
; CHECK-NEXT:    bhs .LBB7_6
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:    mov.w r12, #0
@@ -700,33 +702,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT:    letp lr, .LBB7_5
; CHECK-NEXT:    b .LBB7_11
; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader.new
; CHECK-NEXT:    bic r7, r12, #3
; CHECK-NEXT:    bic r5, r12, #3
; CHECK-NEXT:    add.w r4, r3, #8
; CHECK-NEXT:    subs r7, #4
; CHECK-NEXT:    subs r5, #4
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    add.w lr, lr, r7, lsr #2
; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
; CHECK-NEXT:    adds r5, r0, #3
; CHECK-NEXT:    adds r6, r1, #1
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB7_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrb.w r5, [r0, r12]
; CHECK-NEXT:    add.w r7, r1, r12
; CHECK-NEXT:    ldrb.w r6, [r1, r12]
; CHECK-NEXT:    smlabb r5, r6, r5, r2
; CHECK-NEXT:    str r5, [r4, #-8]
; CHECK-NEXT:    add.w r5, r0, r12
; CHECK-NEXT:    ldrb r6, [r7, #1]
; CHECK-NEXT:    ldrb r8, [r5, #-3]
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    ldrb.w r8, [r5, #1]
; CHECK-NEXT:    smlabb r6, r6, r8, r2
; CHECK-NEXT:    str r6, [r4, #-4]
; CHECK-NEXT:    ldrb.w r8, [r5, #2]
; CHECK-NEXT:    ldrb r6, [r7, #2]
; CHECK-NEXT:    smlabb r6, r6, r8, r2
; CHECK-NEXT:    str r6, [r4]
; CHECK-NEXT:    ldrb r5, [r5, #3]
; CHECK-NEXT:    ldrb r6, [r7, #3]
; CHECK-NEXT:    smlabb r5, r6, r5, r2
; CHECK-NEXT:    str r5, [r4, #4]
; CHECK-NEXT:    ldrb r7, [r6, #-1]
; CHECK-NEXT:    smlabb r7, r7, r8, r2
; CHECK-NEXT:    str r7, [r4, #-8]
; CHECK-NEXT:    ldrb r8, [r5, #-2]
; CHECK-NEXT:    ldrb r7, [r6]
; CHECK-NEXT:    smlabb r7, r7, r8, r2
; CHECK-NEXT:    str r7, [r4, #-4]
; CHECK-NEXT:    ldrb r8, [r5, #-1]
; CHECK-NEXT:    ldrb r7, [r6, #1]
; CHECK-NEXT:    smlabb r7, r7, r8, r2
; CHECK-NEXT:    str r7, [r4]
; CHECK-NEXT:    ldrb.w r8, [r5]
; CHECK-NEXT:    adds r5, #4
; CHECK-NEXT:    ldrb r7, [r6, #2]
; CHECK-NEXT:    adds r6, #4
; CHECK-NEXT:    smlabb r7, r7, r8, r2
; CHECK-NEXT:    str r7, [r4, #4]
; CHECK-NEXT:    adds r4, #16
; CHECK-NEXT:    le lr, .LBB7_7
; CHECK-NEXT:  .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa
@@ -738,10 +742,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT:    mov lr, r9
; CHECK-NEXT:  .LBB7_10: @ %for.body.epil
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrb r7, [r0], #1
; CHECK-NEXT:    ldrb r6, [r1], #1
; CHECK-NEXT:    smlabb r7, r6, r7, r2
; CHECK-NEXT:    str r7, [r3], #4
; CHECK-NEXT:    ldrb r6, [r0], #1
; CHECK-NEXT:    ldrb r5, [r1], #1
; CHECK-NEXT:    smlabb r6, r5, r6, r2
; CHECK-NEXT:    str r6, [r3], #4
; CHECK-NEXT:    le lr, .LBB7_10
; CHECK-NEXT:  .LBB7_11: @ %for.cond.cleanup
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}