Commit 8cba99e2 authored by Sjoerd Meijer's avatar Sjoerd Meijer
Browse files

[ARM][MVE] Tail-Predication: rematerialise iteration count in exit blocks

This patch uses the helper function rewriteLoopExitValues, refactored in
D72602, to rematerialise the iteration count in exit blocks. This allows
ARMLowOverheadLoops to later clean up the loop update expressions inside
hardware loops, which is necessary to get actual performance gains for
tail-predicated loops.

Differential Revision: https://reviews.llvm.org/D72714
parent cfe97681
Loading
Loading
Loading
Loading
+45 −4
Original line number Diff line number Diff line
@@ -35,12 +35,14 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

@@ -56,8 +58,13 @@ namespace {
class MVETailPredication : public LoopPass {
  SmallVector<IntrinsicInst*, 4> MaskedInsts;
  Loop *L = nullptr;
  LoopInfo *LI = nullptr;
  const DataLayout *DL;
  DominatorTree *DT = nullptr;
  ScalarEvolution *SE = nullptr;
  TargetTransformInfo *TTI = nullptr;
  TargetLibraryInfo *TLI = nullptr;
  bool ClonedVCTPInExitBlock = false;

public:
  static char ID;
@@ -69,6 +76,8 @@ public:
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.setPreservesCFG();
  }
@@ -97,6 +106,11 @@ private:
                           DenseMap<Instruction*, Instruction*> &NewPredicates,
                           VectorType *VecTy,
                           Value *NumElements);

  /// Rematerialize the iteration count in exit blocks, which enables
  /// ARMLowOverheadLoops to better optimise away loop update statements inside
  /// hardware-loops.
  void RematerializeIterCount();
};

} // end namespace
@@ -120,6 +134,16 @@ static bool IsMasked(Instruction *I) {
  return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
}

// Rematerialize the loop iteration count in the exit blocks, so that the
// in-loop update expressions become removable later (see the declaration's
// comment: this enables ARMLowOverheadLoops to optimise away loop update
// statements inside hardware-loops).
void MVETailPredication::RematerializeIterCount() {
  // Instructions made dead by the exit-value rewrite are collected here via
  // weak handles, so deletions elsewhere cannot leave dangling pointers.
  SmallVector<WeakTrackingVH, 16> DeadInsts;
  // SCEV expander used to materialise exit values; "mvetp" is the name
  // prefix given to any instructions it creates.
  SCEVExpander Rewriter(*SE, *DL, "mvetp");
  // AlwaysRepl: rewrite every exit value unconditionally — presumably even
  // when not obviously profitable, since the later ARMLowOverheadLoops
  // cleanup depends on it (NOTE(review): confirm against ReplaceExitVal's
  // definition in LoopUtils).
  ReplaceExitVal ReplaceExitValue = AlwaysRepl;

  // rewriteLoopExitValues requires the loop to be in LCSSA form first.
  formLCSSARecursively(*L, *DT, LI, SE);
  rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT, ReplaceExitValue,
                        DeadInsts);
}

bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
  if (skipLoop(L) || DisableTailPredication)
    return false;
@@ -128,8 +152,13 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  auto &TM = TPC.getTM<TargetMachine>();
  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
  DL = &L->getHeader()->getModule()->getDataLayout();
  this->L = L;

  // The MVE and LOB extensions are combined to enable tail-predication, but
@@ -185,7 +214,14 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {

  LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
             << *Decrement << "\n");
  return TryConvert(Setup->getArgOperand(0));

  if (TryConvert(Setup->getArgOperand(0))) {
    if (ClonedVCTPInExitBlock)
      RematerializeIterCount();
    return true;
  }

  return false;
}

bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
@@ -407,14 +443,16 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
// in the block. This means that the VPR doesn't have to be live into the
// exit block which should make it easier to convert this loop into a proper
// tail predicated loop.
static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
                    SetVector<Instruction*> &MaybeDead, Loop *L) {
  BasicBlock *Exit = L->getUniqueExitBlock();
  if (!Exit) {
    LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
    return;
    return false;
  }

  bool ClonedVCTPInExitBlock = false;

  for (auto &Pair : NewPredicates) {
    Instruction *OldPred = Pair.first;
    Instruction *NewPred = Pair.second;
@@ -425,6 +463,7 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
        PredClone->insertBefore(&I);
        I.replaceAllUsesWith(PredClone);
        MaybeDead.insert(&I);
        ClonedVCTPInExitBlock = true;
        LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
                   dbgs() << "ARM TP: with:      "; PredClone->dump());
        break;
@@ -455,6 +494,8 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,

  for (auto I : L->blocks())
    DeleteDeadPHIs(I);

  return ClonedVCTPInExitBlock;
}

void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
@@ -538,7 +579,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
  }

  // Now clean up.
  Cleanup(NewPredicates, Predicates, L);
  ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
  return true;
}

+55 −48
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i
; CHECK-NEXT:    itt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    adds r4, r3, #3
; CHECK-NEXT:    vmov.i32 q1, #0x0
@@ -16,35 +16,36 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i
; CHECK-NEXT:    sub.w r12, r4, #4
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    lsr.w r4, r12, #2
; CHECK-NEXT:    sub.w r12, r3, r4, lsl #2
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r3
; CHECK-NEXT:    mov r4, r3
; CHECK-NEXT:    and r3, r12, #15
; CHECK-NEXT:    and r5, r4, #15
; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT:    vdup.32 q3, r3
; CHECK-NEXT:    vdup.32 q3, r5
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r2], #16
; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
; CHECK-NEXT:    vcmp.i32 eq, q3, zr
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vpsel q1, q2, q1
; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
; CHECK-NEXT:    vmul.i32 q1, q1, q2
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    subs r3, r4, #4
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    le lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
; CHECK-NEXT:    vctp.32 r4
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -97,42 +98,43 @@ for.cond.cleanup: ; preds = %middle.block, %entr
define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
; CHECK-LABEL: vpsel_mul_reduce_add_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    ldr r5, [sp, #20]
; CHECK-NEXT:    cmp r5, #0
; CHECK-NEXT:    ldr.w r12, [sp, #20]
; CHECK-NEXT:    cmp.w r12, #0
; CHECK-NEXT:    beq .LBB1_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r4, r5, #3
; CHECK-NEXT:    add.w r5, r12, #3
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    bic r4, r4, #3
; CHECK-NEXT:    sub.w r12, r4, #4
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    bic r5, r5, #3
; CHECK-NEXT:    subs r4, r5, #4
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    add.w lr, r5, r4, lsr #2
; CHECK-NEXT:    lsrs r4, r4, #2
; CHECK-NEXT:    sub.w r4, r12, r4, lsl #2
; CHECK-NEXT:    movs r5, #0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r5
; CHECK-NEXT:    mov r4, r5
; CHECK-NEXT:    and r5, r12, #15
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    and r6, r5, #15
; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
; CHECK-NEXT:    vdup.32 q3, r5
; CHECK-NEXT:    vdup.32 q3, r6
; CHECK-NEXT:    vsub.i32 q1, q2, q1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
; CHECK-NEXT:    vcmp.i32 eq, q3, zr
; CHECK-NEXT:    adds r5, #4
; CHECK-NEXT:    vpsel q1, q1, q2
; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
; CHECK-NEXT:    vmul.i32 q1, q1, q2
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    subs r5, r4, #4
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    le lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
@@ -140,11 +142,11 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a,
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:  .LBB1_4:
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
@@ -203,19 +205,23 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
; CHECK-LABEL: and_mul_reduce_add:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    ldr r5, [sp, #16]
; CHECK-NEXT:    cbz r5, .LBB2_4
; CHECK-NEXT:    ldr.w r12, [sp, #16]
; CHECK-NEXT:    cmp.w r12, #0
; CHECK-NEXT:    beq .LBB2_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    add.w r4, r12, #3
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    dlstp.32 lr, r5
; CHECK-NEXT:    bic r4, r4, #3
; CHECK-NEXT:    subs r5, r4, #4
; CHECK-NEXT:    lsrs r4, r5, #2
; CHECK-NEXT:    sub.w r4, r12, r4, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
; CHECK-NEXT:    mov r12, r5
; CHECK-NEXT:    vsub.i32 q1, q2, q1
; CHECK-NEXT:    subs r5, #4
; CHECK-NEXT:    vcmp.i32 eq, q1, zr
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
@@ -224,7 +230,7 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    vctp.32 r4
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -285,36 +291,37 @@ for.cond.cleanup: ; preds = %middle.block, %entr
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
; CHECK-LABEL: or_mul_reduce_add:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    ldr r5, [sp, #20]
; CHECK-NEXT:    cmp r5, #0
; CHECK-NEXT:    ldr.w r12, [sp, #20]
; CHECK-NEXT:    cmp.w r12, #0
; CHECK-NEXT:    beq .LBB3_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r4, r5, #3
; CHECK-NEXT:    add.w r4, r12, #3
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    bic r4, r4, #3
; CHECK-NEXT:    sub.w r12, r4, #4
; CHECK-NEXT:    subs r5, r4, #4
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
; CHECK-NEXT:    add.w lr, r4, r5, lsr #2
; CHECK-NEXT:    lsrs r4, r5, #2
; CHECK-NEXT:    sub.w r4, r12, r4, lsl #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r5
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT:    mov r12, r5
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
; CHECK-NEXT:    vsub.i32 q1, q2, q1
; CHECK-NEXT:    vcmp.i32 eq, q1, zr
; CHECK-NEXT:    vmrs r4, p0
; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT:    vmrs r5, p0
; CHECK-NEXT:    orrs r4, r5
; CHECK-NEXT:    sub.w r5, r12, #4
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT:    vmrs r6, p0
; CHECK-NEXT:    orrs r5, r6
; CHECK-NEXT:    vmsr p0, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
@@ -322,15 +329,15 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    le lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    vctp.32 r4
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:  .LBB3_4:
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+2 −1
Original line number Diff line number Diff line
@@ -224,11 +224,12 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
+25 −10
Original line number Diff line number Diff line
@@ -9,12 +9,15 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrb.u32 q2, [r1], #4
; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -74,12 +77,15 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrh.s32 q2, [r1], #8
; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -139,12 +145,15 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrb.u32 q2, [r1], #4
; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -204,12 +213,15 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrh.u32 q2, [r1], #8
; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -269,12 +281,15 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmla.u32 q0, q2, r0
+21 −12
Original line number Diff line number Diff line
@@ -9,21 +9,24 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    lsr.w r3, r12, #2
; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    vmul.i32 q1, q2, q1
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vmul.i32 q0, q2, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
; CHECK-NEXT:    vctp.32 r3
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    pop {r7, pc}
entry:
@@ -75,14 +78,17 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r1, r2, #3
; CHECK-NEXT:    bic r1, r1, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    lsrs r1, r1, #2
; CHECK-NEXT:    sub.w r1, r2, r1, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r1, r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    letp lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
@@ -135,14 +141,17 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r1, r2, #3
; CHECK-NEXT:    bic r1, r1, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    lsrs r1, r1, #2
; CHECK-NEXT:    sub.w r1, r2, r1, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov r1, r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    letp lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
Loading