Commit 757ac02d authored by Sam Parker's avatar Sam Parker
Browse files

[ARM] Implement TTI::isHardwareLoopProfitable

    
Implement the backend target hook to drive the HardwareLoops pass.
The low-overhead branch extension for Arm M-class cores is flexible
enough that we don't have to ensure correctness at this point, except
checking that the loop counter variable can be stored in LR - a
32-bit register. For it to be profitable, we want to avoid loops that
contain function calls, or any other instruction that alters the PC.
    
This implementation uses TargetLoweringInfo, to query type and
operation actions, looks at intrinsic calls and also performs some
manual checks for remainder/division and FP operations.
    
I think this should be a good base to start and extra details can be
filled out later.

Differential Revision: https://reviews.llvm.org/D62907

llvm-svn: 363149
parent 91bb72a3
Loading
Loading
Loading
Loading
+194 −0
Original line number Diff line number Diff line
@@ -36,6 +36,10 @@ using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(true),
  cl::desc("Disable the generation of low-overhead loops"));

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -628,6 +632,196 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           UseMaskForCond, UseMaskForGaps);
}

bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  if (!F->isIntrinsic())
    BaseT::isLoweredToCall(F);

  // Assume all Arm-specific intrinsics map to an instruction.
  if (F->getName().startswith("llvm.arm"))
    return false;

  switch (F->getIntrinsicID()) {
  default: break;
  case Intrinsic::powi:
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::pow:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::exp:
  case Intrinsic::exp2:
    return true;
  case Intrinsic::sqrt:
  case Intrinsic::fabs:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::canonicalize:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
      return true;
    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
      return true;
    // Some operations can be handled by vector instructions and assume
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO Handle scalar operations properly.
    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
  case Intrinsic::masked_store:
  case Intrinsic::masked_load:
  case Intrinsic::masked_gather:
  case Intrinsic::masked_scatter:
    return !ST->hasMVEIntegerOps();
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::usub_sat:
    return false;
  }

  return BaseT::isLoweredToCall(F);
}

bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          TTI::HardwareLoopInfo &HWLoopInfo) {
  // Low-overhead branches are only supported in the 'low-overhead branch'
  // extension of v8.1-m.
  if (!ST->hasLOB() || DisableLowOverheadLoops)
    return false;

  // For now, for simplicity, only support loops with one exit block.
  if (!L->getExitBlock())
    return false;

  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return false;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return false;

  const SCEV *TripCountSCEV =
    SE.getAddExpr(BackedgeTakenCount,
                  SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in LR, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
    return false;

  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  // point in generating a hardware loop if that's going to happen.
  auto MaybeCall = [this](Instruction &I) {
    const ARMTargetLowering *TLI = getTLI();
    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
    EVT VT = TLI->getValueType(DL, I.getType(), true);
    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
      return true;

    // Check if an intrinsic will be lowered to a call and assume that any
    // other CallInst will generate a bl.
    if (auto *Call = dyn_cast<CallInst>(&I)) {
      if (isa<IntrinsicInst>(Call)) {
        if (const Function *F = Call->getCalledFunction())
          return isLoweredToCall(F);
      }
      return true;
    }

    // FPv5 provides conversions between integer, double-precision,
    // single-precision, and half-precision formats.
    switch (I.getOpcode()) {
    default:
      break;
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::FPTrunc:
    case Instruction::FPExt:
      return !ST->hasFPARMv8Base();
    }

    // FIXME: Unfortunately the approach of checking the Operation Action does
    // not catch all cases of Legalization that use library calls. Our
    // Legalization step categorizes some transformations into library calls as
    // Custom, Expand or even Legal when doing type legalization. So for now
    // we have to special case for instance the SDIV of 64bit integers and the
    // use of floating point emulation.
    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
      switch (ISD) {
      default:
        break;
      case ISD::SDIV:
      case ISD::UDIV:
      case ISD::SREM:
      case ISD::UREM:
      case ISD::SDIVREM:
      case ISD::UDIVREM:
        return true;
      }
    }

    // Assume all other non-float operations are supported.
    if (!VT.isFloatingPoint())
      return false;

    // We'll need a library call to handle most floats when using soft.
    if (TLI->useSoftFloat()) {
      switch (I.getOpcode()) {
      default:
        return true;
      case Instruction::Alloca:
      case Instruction::Load:
      case Instruction::Store:
      case Instruction::Select:
      case Instruction::PHI:
        return false;
      }
    }

    // We'll need a libcall to perform double precision operations on a single
    // precision only FPU.
    if (I.getType()->isDoubleTy() && !ST->hasFP64())
      return true;

    // Likewise for half precision arithmetic.
    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
      return true;

    return false;
  };

  // Scan the instructions to see if there's any that we know will turn into a
  // call.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (MaybeCall(I))
        return false;

  // TODO: Check whether the trip count calculation is expensive. If L is the
  // inner loop but we know it has a low trip count, calculating that trip
  // count (in the parent loop) may be detrimental.

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CounterInReg = true;
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  // Only currently enable these preferences for M-Class cores.
+6 −0
Original line number Diff line number Diff line
@@ -180,6 +180,12 @@ public:
                                 bool UseMaskForCond = false,
                                 bool UseMaskForGaps = false);

  bool isLoweredToCall(const Function *F);
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                TTI::HardwareLoopInfo &HWLoopInfo);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

+404 −0
Original line number Diff line number Diff line
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP


; CHECK-LABEL: skip_call
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-NOT: call i32 @llvm.loop.decrement

define i32 @skip_call(i32 %n) {
entry:
  %cmp6 = icmp eq i32 %n, 0
  br i1 %cmp6, label %while.end, label %while.body.preheader

while.body.preheader:
  br label %while.body

while.body:
  %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
  %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
  %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
  %add = add nsw i32 %call, %res.07
  %inc1 = add nuw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc1, %n
  br i1 %exitcond, label %while.end.loopexit, label %while.body

while.end.loopexit:
  br label %while.end

while.end:
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
  ret i32 %res.0.lcssa
}

; CHECK-LABEL: test_target_specific
; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %loop, label %exit

define i32 @test_target_specific(i32* %a, i32* %b) {
entry:
  br label %loop
loop:
  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr i32, i32* %a, i32 %count
  %addr.b = getelementptr i32, i32* %b, i32 %count
  %load.a = load i32, i32* %addr.a
  %load.b = load i32, i32* %addr.b
  %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
  %count.next = add nuw i32 %count, 2
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret i32 %res
}

; CHECK-LABEL: test_fabs_f16
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_fabs_f16(half* %a, half* %b) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr half, half* %a, i32 %count
  %load.a = load half, half* %addr.a
  %abs = call half @llvm.fabs.f16(half %load.a)
  %addr.b = getelementptr half, half* %b, i32 %count
  store half %abs, half *%addr.b
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_fabs
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
define float @test_fabs(float* %a) {
entry:
  br label %loop
loop:
  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr float, float* %a, i32 %count
  %load.a = load float, float* %addr.a
  %abs = call float @llvm.fabs.f32(float %load.a)
  %res = fadd float %abs, %acc
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret float %res
}

; CHECK-LABEL: test_fabs_64
; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-FP64:       void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_fabs_64(double* %a, double* %b) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr double, double* %a, i32 %count
  %load.a = load double, double* %addr.a
  %abs = call double @llvm.fabs.f64(double %load.a)
  %addr.b = getelementptr double, double* %b, i32 %count
  store double %abs, double *%addr.b
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_fabs_vec
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
define <4 x float> @test_fabs_vec(<4 x float>* %a) {
entry:
  br label %loop
loop:
  %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
  %load.a = load <4 x float>, <4 x float>* %addr.a
  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
  %res = fadd <4 x float> %abs, %acc
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret <4 x float> %res
}

; CHECK-LABEL: test_log
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define float @test_log(float* %a) {
entry:
  br label %loop
loop:
  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr float, float* %a, i32 %count
  %load.a = load float, float* %addr.a
  %abs = call float @llvm.log.f32(float %load.a)
  %res = fadd float %abs, %acc
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret float %res
}

; CHECK-LABEL: test_sqrt_16
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-FP64:     call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_sqrt_16(half* %a, half* %b) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr half, half* %a, i32 %count
  %load.a = load half, half* %addr.a
  %sqrt = call half @llvm.sqrt.f16(half %load.a)
  %addr.b = getelementptr half, half* %b, i32 %count
  store half %sqrt, half *%addr.b
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}
; CHECK-LABEL: test_sqrt
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
define void @test_sqrt(float* %a, float* %b) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr float, float* %a, i32 %count
  %load.a = load float, float* %addr.a
  %sqrt = call float @llvm.sqrt.f32(float %load.a)
  %addr.b = getelementptr float, float* %b, i32 %count
  store float %sqrt, float* %addr.b
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_sqrt_64
; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-FP64:       call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_sqrt_64(double* %a, double* %b) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr double, double* %a, i32 %count
  %load.a = load double, double* %addr.a
  %sqrt = call double @llvm.sqrt.f64(double %load.a)
  %addr.b = getelementptr double, double* %b, i32 %count
  store double %sqrt, double *%addr.b
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_sqrt_vec
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
  %load.a = load <4 x float>, <4 x float>* %addr.a
  %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
  store <4 x float> %sqrt, <4 x float>* %addr.b
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_overflow
; CHECK: call void @llvm.set.loop.iterations
define i32 @test_overflow(i32* %a, i32* %b) {
entry:
  br label %loop
loop:
  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr i32, i32* %a, i32 %count
  %addr.b = getelementptr i32, i32* %b, i32 %count
  %load.a = load i32, i32* %addr.a
  %load.b = load i32, i32* %addr.b
  %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
  %res = extractvalue {i32, i1} %sadd, 0
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret i32 %res
}

; TODO: We should be able to generate a qadd/sub
; CHECK-LABEL: test_sat
; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
define i32 @test_sat(i32* %a, i32* %b) {
entry:
  br label %loop
loop:
  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr i32, i32* %a, i32 %count
  %addr.b = getelementptr i32, i32* %b, i32 %count
  %load.a = load i32, i32* %addr.a
  %load.b = load i32, i32* %addr.b
  %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret i32 %res
}

; CHECK-LABEL: test_masked_i32
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations
; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
  %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
  %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
  %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
  %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
  %res = add <4 x i32> %load.a, %load.b
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_masked_f32
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations
; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
  %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
  %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %res = fadd <4 x float> %load.a, %load.b
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

; CHECK-LABEL: test_gather_scatter
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations
; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
entry:
  br label %loop
loop:
  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
  %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %res = fadd <4 x float> %load.a, %load.b
  call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
  %count.next = add nuw i32 %count, 1
  %cmp = icmp ne i32 %count.next, 100
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}

declare i32 @bar(...) local_unnamed_addr #1
declare i32 @llvm.arm.smlad(i32, i32, i32)
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.log.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare half @llvm.sqrt.f16(half)
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare i32 @llvm.sadd.sat.i32(i32, i32)
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
+35 −0

File added.

Preview size limit exceeded, changes collapsed.

+259 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading