Loading llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +0 −3 Original line number Diff line number Diff line Loading @@ -3530,9 +3530,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI, const SCEV *LoopStep = AR->getStepRecurrence(SE); if (!isa<SCEVConstant>(LoopStep)) return false; if (LU.AccessTy.getType()->getScalarSizeInBits() != LoopStep->getType()->getScalarSizeInBits()) return false; // Check if a post-indexed load/store can be used. if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { Loading llvm/test/CodeGen/Hexagon/addrmode-align.ll +1 −1 Original line number Diff line number Diff line ; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK: [[REG0:(r[0-9]+)]] = add(r29 ; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#4) ; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#8) ; CHECK-DAG: memd([[REG1]]+#8) = ; CHECK-DAG: memd([[REG1]]+#0) = Loading llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll 0 → 100644 +50 −0 Original line number Diff line number Diff line ; RUN: llc -O3 -march=hexagon < %s | FileCheck %s ; Test to ensure LSR does not optimize out the addrec of the outer loop. ; This will help to generate post-increment instructions, otherwise ; it ends up as an extra reg+reg add inside the loop. 
; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: ; CHECK: memuh{{.*}}++ ; CHECK: endloop define dso_local signext i16 @foo(i16* nocapture readonly %filt, i16* nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr { entry: %cmp28 = icmp sgt i32 %c1, 0 %cmp221 = icmp sgt i32 %c2, 0 %or.cond = and i1 %cmp28, %cmp221 br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup for.cond1.preheader.us: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge.us %filt.addr.032.us = phi i16* [ %scevgep, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %filt, %entry ] %inp.addr.031.us = phi i16* [ %scevgep35, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %inp, %entry ] %l.030.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] %sum0.029.us = phi i16 [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] %scevgep = getelementptr i16, i16* %filt.addr.032.us, i32 %c2 br label %for.body4.us for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us %z.025.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ] %filt.addr.124.us = phi i16* [ %filt.addr.032.us, %for.cond1.preheader.us ], [ %incdec.ptr.us, %for.body4.us ] %inp.addr.123.us = phi i16* [ %inp.addr.031.us, %for.cond1.preheader.us ], [ %incdec.ptr5.us, %for.body4.us ] %sum0.122.us = phi i16 [ %sum0.029.us, %for.cond1.preheader.us ], [ %add8.us, %for.body4.us ] %incdec.ptr.us = getelementptr inbounds i16, i16* %filt.addr.124.us, i32 1 %0 = load i16, i16* %filt.addr.124.us, align 2 %incdec.ptr5.us = getelementptr inbounds i16, i16* %inp.addr.123.us, i32 1 %1 = load i16, i16* %inp.addr.123.us, align 2 %add.us = add i16 %0, %sum0.122.us %add8.us = add i16 %add.us, %1 %inc.us = add nuw nsw i32 %z.025.us, 1 %exitcond = icmp eq i32 %inc.us, %c2 br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us %scevgep35 = getelementptr i16, i16* 
%inp.addr.031.us, i32 %c2 %inc11.us = add nuw nsw i32 %l.030.us, 1 %exitcond36 = icmp eq i32 %inc11.us, %c1 br i1 %exitcond36, label %for.cond.cleanup, label %for.cond1.preheader.us for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry %sum0.0.lcssa = phi i16 [ 0, %entry ], [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ] ret i16 %sum0.0.lcssa } llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +22 −22 Original line number Diff line number Diff line Loading @@ -1778,11 +1778,11 @@ for.body: ; preds = %for.body, %for.body define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: half_short_mac: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cbz r2, .LBB11_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r7, r2, #3 ; CHECK-NEXT: and r6, r2, #3 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB11_4 ; CHECK-NEXT: @ %bb.2: Loading @@ -1799,33 +1799,33 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: adds r3, r1, #4 ; CHECK-NEXT: adds r2, r0, #4 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r2, r1, r3 ; CHECK-NEXT: adds r6, r0, r3 ; CHECK-NEXT: vldr.16 s2, [r6, #6] ; CHECK-NEXT: ldrsh.w r4, [r3, #2] ; CHECK-NEXT: vldr.16 s2, [r2, #2] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrsh.w r4, [r2, #2] ; CHECK-NEXT: ldrsh.w r5, [r2, #4] ; CHECK-NEXT: ldrsh.w r2, [r2, #6] ; CHECK-NEXT: vmov s8, r4 ; CHECK-NEXT: vmov s6, r5 ; CHECK-NEXT: vmov s4, r2 ; CHECK-NEXT: vmov s4, r4 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 ; CHECK-NEXT: ldrsh.w r4, [r3] ; CHECK-NEXT: vmul.f16 s2, s2, s4 ; CHECK-NEXT: vldr.16 s4, 
[r6, #4] ; CHECK-NEXT: vldr.16 s4, [r2] ; CHECK-NEXT: vmov s6, r4 ; CHECK-NEXT: vcvt.f16.s32 s6, s6 ; CHECK-NEXT: ldrsh r5, [r3, #-2] ; CHECK-NEXT: ldrsh r4, [r3, #-4] ; CHECK-NEXT: vmul.f16 s4, s4, s6 ; CHECK-NEXT: vldr.16 s6, [r6, #2] ; CHECK-NEXT: vldr.16 s6, [r2, #-2] ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmov s8, r5 ; CHECK-NEXT: vcvt.f16.s32 s8, s8 ; CHECK-NEXT: ldrsh r2, [r1, r3] ; CHECK-NEXT: vmov s10, r4 ; CHECK-NEXT: vmul.f16 s6, s6, s8 ; CHECK-NEXT: vldr.16 s8, [r6] ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmov s10, r2 ; CHECK-NEXT: vldr.16 s8, [r2, #-4] ; CHECK-NEXT: vcvt.f16.s32 s10, s10 ; CHECK-NEXT: adds r2, #8 ; CHECK-NEXT: vmul.f16 s8, s8, s10 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 Loading @@ -1837,11 +1837,11 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_5 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r7, .LBB11_9 ; CHECK-NEXT: wls lr, r6, .LBB11_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: mov lr, r6 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r1], #2 Loading @@ -1854,7 +1854,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_8 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI11_0: Loading llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +94 −90 Original line number Diff line number Diff line Loading @@ -372,28 +372,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: cmp.w r12, #0 ; 
CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph ; CHECK-NEXT: add.w r4, r3, r12, lsl #2 ; CHECK-NEXT: add.w r5, r1, r12 ; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: ands r6, r4 ; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r4, r5, r7 ; CHECK-NEXT: lslseq.w r4, r4, #31 ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 Loading @@ -409,33 +409,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb.w r5, [r0, r12] ; CHECK-NEXT: add.w r7, r1, r12 ; CHECK-NEXT: ldrb.w r6, [r1, r12] ; CHECK-NEXT: 
smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #-8] ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: ldrb r6, [r7, #1] ; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb.w r8, [r5, #1] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4, #-4] ; CHECK-NEXT: ldrb.w r8, [r5, #2] ; CHECK-NEXT: ldrb r6, [r7, #2] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4] ; CHECK-NEXT: ldrb r5, [r5, #3] ; CHECK-NEXT: ldrb r6, [r7, #3] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #4] ; CHECK-NEXT: ldrb r7, [r6, #-1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] ; CHECK-NEXT: ldrb r7, [r6] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] ; CHECK-NEXT: ldrb r7, [r6, #1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] ; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: ldrb r7, [r6, #2] ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa Loading @@ -447,10 +449,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 ; CHECK-NEXT: ldrb r6, [r1], #1 ; CHECK-NEXT: smlabb r7, r6, r7, r2 ; CHECK-NEXT: str r7, [r3], #4 ; CHECK-NEXT: ldrb r6, [r0], #1 ; CHECK-NEXT: ldrb r5, [r1], #1 ; CHECK-NEXT: smlabb r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB5_10 ; CHECK-NEXT: .LBB5_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} Loading Loading @@ -663,28 +665,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; 
CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph ; CHECK-NEXT: add.w r4, r3, r12, lsl #2 ; CHECK-NEXT: add.w r5, r1, r12 ; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: ands r6, r4 ; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r4, r5, r7 ; CHECK-NEXT: lslseq.w r4, r4, #31 ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 Loading @@ -700,33 +702,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb.w r5, [r0, r12] ; CHECK-NEXT: add.w r7, r1, r12 ; CHECK-NEXT: ldrb.w r6, [r1, r12] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; 
CHECK-NEXT: str r5, [r4, #-8] ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: ldrb r6, [r7, #1] ; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb.w r8, [r5, #1] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4, #-4] ; CHECK-NEXT: ldrb.w r8, [r5, #2] ; CHECK-NEXT: ldrb r6, [r7, #2] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4] ; CHECK-NEXT: ldrb r5, [r5, #3] ; CHECK-NEXT: ldrb r6, [r7, #3] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #4] ; CHECK-NEXT: ldrb r7, [r6, #-1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] ; CHECK-NEXT: ldrb r7, [r6] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] ; CHECK-NEXT: ldrb r7, [r6, #1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] ; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: ldrb r7, [r6, #2] ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa Loading @@ -738,10 +742,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 ; CHECK-NEXT: ldrb r6, [r1], #1 ; CHECK-NEXT: smlabb r7, r6, r7, r2 ; CHECK-NEXT: str r7, [r3], #4 ; CHECK-NEXT: ldrb r6, [r0], #1 ; CHECK-NEXT: ldrb r5, [r1], #1 ; CHECK-NEXT: smlabb r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB7_10 ; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} Loading Loading
llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +0 −3 Original line number Diff line number Diff line Loading @@ -3530,9 +3530,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI, const SCEV *LoopStep = AR->getStepRecurrence(SE); if (!isa<SCEVConstant>(LoopStep)) return false; if (LU.AccessTy.getType()->getScalarSizeInBits() != LoopStep->getType()->getScalarSizeInBits()) return false; // Check if a post-indexed load/store can be used. if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { Loading
llvm/test/CodeGen/Hexagon/addrmode-align.ll +1 −1 Original line number Diff line number Diff line ; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK: [[REG0:(r[0-9]+)]] = add(r29 ; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#4) ; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#8) ; CHECK-DAG: memd([[REG1]]+#8) = ; CHECK-DAG: memd([[REG1]]+#0) = Loading
llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll 0 → 100644 +50 −0 Original line number Diff line number Diff line ; RUN: llc -O3 -march=hexagon < %s | FileCheck %s ; Test to ensure LSR does not optimize out the addrec of the outer loop. ; This will help to generate post-increment instructions, otherwise ; it ends up as an extra reg+reg add inside the loop. ; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: ; CHECK: memuh{{.*}}++ ; CHECK: endloop define dso_local signext i16 @foo(i16* nocapture readonly %filt, i16* nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr { entry: %cmp28 = icmp sgt i32 %c1, 0 %cmp221 = icmp sgt i32 %c2, 0 %or.cond = and i1 %cmp28, %cmp221 br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup for.cond1.preheader.us: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge.us %filt.addr.032.us = phi i16* [ %scevgep, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %filt, %entry ] %inp.addr.031.us = phi i16* [ %scevgep35, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %inp, %entry ] %l.030.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] %sum0.029.us = phi i16 [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] %scevgep = getelementptr i16, i16* %filt.addr.032.us, i32 %c2 br label %for.body4.us for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us %z.025.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ] %filt.addr.124.us = phi i16* [ %filt.addr.032.us, %for.cond1.preheader.us ], [ %incdec.ptr.us, %for.body4.us ] %inp.addr.123.us = phi i16* [ %inp.addr.031.us, %for.cond1.preheader.us ], [ %incdec.ptr5.us, %for.body4.us ] %sum0.122.us = phi i16 [ %sum0.029.us, %for.cond1.preheader.us ], [ %add8.us, %for.body4.us ] %incdec.ptr.us = getelementptr inbounds i16, i16* %filt.addr.124.us, i32 1 %0 = load i16, i16* %filt.addr.124.us, align 2 %incdec.ptr5.us = getelementptr inbounds i16, i16* %inp.addr.123.us, i32 1 %1 = load i16, i16* 
%inp.addr.123.us, align 2 %add.us = add i16 %0, %sum0.122.us %add8.us = add i16 %add.us, %1 %inc.us = add nuw nsw i32 %z.025.us, 1 %exitcond = icmp eq i32 %inc.us, %c2 br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us %scevgep35 = getelementptr i16, i16* %inp.addr.031.us, i32 %c2 %inc11.us = add nuw nsw i32 %l.030.us, 1 %exitcond36 = icmp eq i32 %inc11.us, %c1 br i1 %exitcond36, label %for.cond.cleanup, label %for.cond1.preheader.us for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry %sum0.0.lcssa = phi i16 [ 0, %entry ], [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ] ret i16 %sum0.0.lcssa }
llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +22 −22 Original line number Diff line number Diff line Loading @@ -1778,11 +1778,11 @@ for.body: ; preds = %for.body, %for.body define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: half_short_mac: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cbz r2, .LBB11_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r7, r2, #3 ; CHECK-NEXT: and r6, r2, #3 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB11_4 ; CHECK-NEXT: @ %bb.2: Loading @@ -1799,33 +1799,33 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: adds r3, r1, #4 ; CHECK-NEXT: adds r2, r0, #4 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r2, r1, r3 ; CHECK-NEXT: adds r6, r0, r3 ; CHECK-NEXT: vldr.16 s2, [r6, #6] ; CHECK-NEXT: ldrsh.w r4, [r3, #2] ; CHECK-NEXT: vldr.16 s2, [r2, #2] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrsh.w r4, [r2, #2] ; CHECK-NEXT: ldrsh.w r5, [r2, #4] ; CHECK-NEXT: ldrsh.w r2, [r2, #6] ; CHECK-NEXT: vmov s8, r4 ; CHECK-NEXT: vmov s6, r5 ; CHECK-NEXT: vmov s4, r2 ; CHECK-NEXT: vmov s4, r4 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 ; CHECK-NEXT: ldrsh.w r4, [r3] ; CHECK-NEXT: vmul.f16 s2, s2, s4 ; CHECK-NEXT: vldr.16 s4, [r6, #4] ; CHECK-NEXT: vldr.16 s4, [r2] ; CHECK-NEXT: vmov s6, r4 ; CHECK-NEXT: vcvt.f16.s32 s6, s6 ; CHECK-NEXT: ldrsh r5, [r3, #-2] ; CHECK-NEXT: ldrsh r4, [r3, #-4] ; CHECK-NEXT: vmul.f16 s4, s4, s6 ; CHECK-NEXT: vldr.16 s6, [r6, #2] ; CHECK-NEXT: vldr.16 s6, [r2, #-2] ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmov s8, r5 ; CHECK-NEXT: vcvt.f16.s32 s8, s8 ; CHECK-NEXT: ldrsh r2, 
[r1, r3] ; CHECK-NEXT: vmov s10, r4 ; CHECK-NEXT: vmul.f16 s6, s6, s8 ; CHECK-NEXT: vldr.16 s8, [r6] ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmov s10, r2 ; CHECK-NEXT: vldr.16 s8, [r2, #-4] ; CHECK-NEXT: vcvt.f16.s32 s10, s10 ; CHECK-NEXT: adds r2, #8 ; CHECK-NEXT: vmul.f16 s8, s8, s10 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 Loading @@ -1837,11 +1837,11 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_5 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r7, .LBB11_9 ; CHECK-NEXT: wls lr, r6, .LBB11_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: mov lr, r6 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r1], #2 Loading @@ -1854,7 +1854,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_8 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI11_0: Loading
llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +94 −90 Original line number Diff line number Diff line Loading @@ -372,28 +372,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph ; CHECK-NEXT: add.w r4, r3, r12, lsl #2 ; CHECK-NEXT: add.w r5, r1, r12 ; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: ands r6, r4 ; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r4, r5, r7 ; CHECK-NEXT: lslseq.w r4, r4, #31 ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 Loading @@ -409,33 +409,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, 
r0, #3 ; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb.w r5, [r0, r12] ; CHECK-NEXT: add.w r7, r1, r12 ; CHECK-NEXT: ldrb.w r6, [r1, r12] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #-8] ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: ldrb r6, [r7, #1] ; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb.w r8, [r5, #1] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4, #-4] ; CHECK-NEXT: ldrb.w r8, [r5, #2] ; CHECK-NEXT: ldrb r6, [r7, #2] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4] ; CHECK-NEXT: ldrb r5, [r5, #3] ; CHECK-NEXT: ldrb r6, [r7, #3] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #4] ; CHECK-NEXT: ldrb r7, [r6, #-1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] ; CHECK-NEXT: ldrb r7, [r6] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] ; CHECK-NEXT: ldrb r7, [r6, #1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] ; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: ldrb r7, [r6, #2] ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa Loading @@ -447,10 +449,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 ; CHECK-NEXT: ldrb r6, [r1], #1 ; CHECK-NEXT: smlabb r7, r6, r7, r2 ; CHECK-NEXT: str r7, [r3], #4 ; CHECK-NEXT: ldrb r6, [r0], #1 ; CHECK-NEXT: ldrb r5, [r1], #1 ; CHECK-NEXT: smlabb r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB5_10 ; CHECK-NEXT: 
.LBB5_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} Loading Loading @@ -663,28 +665,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph ; CHECK-NEXT: add.w r4, r3, r12, lsl #2 ; CHECK-NEXT: add.w r5, r1, r12 ; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: ands r6, r4 ; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r4, r5, r7 ; CHECK-NEXT: lslseq.w r4, r4, #31 ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 Loading @@ -700,33 +702,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: adds r6, r1, 
#1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb.w r5, [r0, r12] ; CHECK-NEXT: add.w r7, r1, r12 ; CHECK-NEXT: ldrb.w r6, [r1, r12] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #-8] ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: ldrb r6, [r7, #1] ; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb.w r8, [r5, #1] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4, #-4] ; CHECK-NEXT: ldrb.w r8, [r5, #2] ; CHECK-NEXT: ldrb r6, [r7, #2] ; CHECK-NEXT: smlabb r6, r6, r8, r2 ; CHECK-NEXT: str r6, [r4] ; CHECK-NEXT: ldrb r5, [r5, #3] ; CHECK-NEXT: ldrb r6, [r7, #3] ; CHECK-NEXT: smlabb r5, r6, r5, r2 ; CHECK-NEXT: str r5, [r4, #4] ; CHECK-NEXT: ldrb r7, [r6, #-1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] ; CHECK-NEXT: ldrb r7, [r6] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] ; CHECK-NEXT: ldrb r7, [r6, #1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] ; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: ldrb r7, [r6, #2] ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa Loading @@ -738,10 +742,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 ; CHECK-NEXT: ldrb r6, [r1], #1 ; CHECK-NEXT: smlabb r7, r6, r7, r2 ; CHECK-NEXT: str r7, [r3], #4 ; CHECK-NEXT: ldrb r6, [r0], #1 ; CHECK-NEXT: ldrb r5, [r1], #1 ; CHECK-NEXT: smlabb r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB7_10 ; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup ; 
CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} Loading