[ARM] Implement TTI::isHardwareLoopProfitable (757ac02d) · Commits · Cabrera, Anthony / llvm-project

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

+194 −0

Original line number	Diff line number	Diff line
		@@ -36,6 +36,10 @@ using namespace llvm;

		#define DEBUG_TYPE "armtti"

		static cl::opt<bool> DisableLowOverheadLoops(
		"disable-arm-loloops", cl::Hidden, cl::init(true),
		cl::desc("Disable the generation of low-overhead loops"));

		bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
		const Function *Callee) const {
		const TargetMachine &TM = getTLI()->getTargetMachine();
		@@ -628,6 +632,196 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
		UseMaskForCond, UseMaskForGaps);
		}

		bool ARMTTIImpl::isLoweredToCall(const Function *F) {
		if (!F->isIntrinsic())
		BaseT::isLoweredToCall(F);

		// Assume all Arm-specific intrinsics map to an instruction.
		if (F->getName().startswith("llvm.arm"))
		return false;

		switch (F->getIntrinsicID()) {
		default: break;
		case Intrinsic::powi:
		case Intrinsic::sin:
		case Intrinsic::cos:
		case Intrinsic::pow:
		case Intrinsic::log:
		case Intrinsic::log10:
		case Intrinsic::log2:
		case Intrinsic::exp:
		case Intrinsic::exp2:
		return true;
		case Intrinsic::sqrt:
		case Intrinsic::fabs:
		case Intrinsic::copysign:
		case Intrinsic::floor:
		case Intrinsic::ceil:
		case Intrinsic::trunc:
		case Intrinsic::rint:
		case Intrinsic::nearbyint:
		case Intrinsic::round:
		case Intrinsic::canonicalize:
		case Intrinsic::lround:
		case Intrinsic::llround:
		case Intrinsic::lrint:
		case Intrinsic::llrint:
		if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
		return true;
		if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
		return true;
		// Some operations can be handled by vector instructions and assume
		// unsupported vectors will be expanded into supported scalar ones.
		// TODO Handle scalar operations properly.
		return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
		case Intrinsic::masked_store:
		case Intrinsic::masked_load:
		case Intrinsic::masked_gather:
		case Intrinsic::masked_scatter:
		return !ST->hasMVEIntegerOps();
		case Intrinsic::sadd_with_overflow:
		case Intrinsic::uadd_with_overflow:
		case Intrinsic::ssub_with_overflow:
		case Intrinsic::usub_with_overflow:
		case Intrinsic::sadd_sat:
		case Intrinsic::uadd_sat:
		case Intrinsic::ssub_sat:
		case Intrinsic::usub_sat:
		return false;
		}

		return BaseT::isLoweredToCall(F);
		}

		bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
		AssumptionCache &AC,
		TargetLibraryInfo *LibInfo,
		TTI::HardwareLoopInfo &HWLoopInfo) {
		// Low-overhead branches are only supported in the 'low-overhead branch'
		// extension of v8.1-m.
		if (!ST->hasLOB() \|\| DisableLowOverheadLoops)
		return false;

		// For now, for simplicity, only support loops with one exit block.
		if (!L->getExitBlock())
		return false;

		if (!SE.hasLoopInvariantBackedgeTakenCount(L))
		return false;

		const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
		if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
		return false;

		const SCEV *TripCountSCEV =
		SE.getAddExpr(BackedgeTakenCount,
		SE.getOne(BackedgeTakenCount->getType()));

		// We need to store the trip count in LR, a 32-bit register.
		if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
		return false;

		// Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
		// point in generating a hardware loop if that's going to happen.
		auto MaybeCall = [this](Instruction &I) {
		const ARMTargetLowering *TLI = getTLI();
		unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
		EVT VT = TLI->getValueType(DL, I.getType(), true);
		if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
		return true;

		// Check if an intrinsic will be lowered to a call and assume that any
		// other CallInst will generate a bl.
		if (auto *Call = dyn_cast<CallInst>(&I)) {
		if (isa<IntrinsicInst>(Call)) {
		if (const Function *F = Call->getCalledFunction())
		return isLoweredToCall(F);
		}
		return true;
		}

		// FPv5 provides conversions between integer, double-precision,
		// single-precision, and half-precision formats.
		switch (I.getOpcode()) {
		default:
		break;
		case Instruction::FPToSI:
		case Instruction::FPToUI:
		case Instruction::SIToFP:
		case Instruction::UIToFP:
		case Instruction::FPTrunc:
		case Instruction::FPExt:
		return !ST->hasFPARMv8Base();
		}

		// FIXME: Unfortunately the approach of checking the Operation Action does
		// not catch all cases of Legalization that use library calls. Our
		// Legalization step categorizes some transformations into library calls as
		// Custom, Expand or even Legal when doing type legalization. So for now
		// we have to special case for instance the SDIV of 64bit integers and the
		// use of floating point emulation.
		if (VT.isInteger() && VT.getSizeInBits() >= 64) {
		switch (ISD) {
		default:
		break;
		case ISD::SDIV:
		case ISD::UDIV:
		case ISD::SREM:
		case ISD::UREM:
		case ISD::SDIVREM:
		case ISD::UDIVREM:
		return true;
		}
		}

		// Assume all other non-float operations are supported.
		if (!VT.isFloatingPoint())
		return false;

		// We'll need a library call to handle most floats when using soft.
		if (TLI->useSoftFloat()) {
		switch (I.getOpcode()) {
		default:
		return true;
		case Instruction::Alloca:
		case Instruction::Load:
		case Instruction::Store:
		case Instruction::Select:
		case Instruction::PHI:
		return false;
		}
		}

		// We'll need a libcall to perform double precision operations on a single
		// precision only FPU.
		if (I.getType()->isDoubleTy() && !ST->hasFP64())
		return true;

		// Likewise for half precision arithmetic.
		if (I.getType()->isHalfTy() && !ST->hasFullFP16())
		return true;

		return false;
		};

		// Scan the instructions to see if there's any that we know will turn into a
		// call.
		for (auto *BB : L->getBlocks())
		for (auto &I : *BB)
		if (MaybeCall(I))
		return false;

		// TODO: Check whether the trip count calculation is expensive. If L is the
		// inner loop but we know it has a low trip count, calculating that trip
		// count (in the parent loop) may be detrimental.

		LLVMContext &C = L->getHeader()->getContext();
		HWLoopInfo.CounterInReg = true;
		HWLoopInfo.CountType = Type::getInt32Ty(C);
		HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
		return true;
		}

		void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
		TTI::UnrollingPreferences &UP) {
		// Only currently enable these preferences for M-Class cores.

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -180,6 +180,12 @@ public:
		bool UseMaskForCond = false,
		bool UseMaskForGaps = false);

		bool isLoweredToCall(const Function *F);
		bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
		AssumptionCache &AC,
		TargetLibraryInfo *LibInfo,
		TTI::HardwareLoopInfo &HWLoopInfo);

		void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
		TTI::UnrollingPreferences &UP);

llvm/test/Transforms/HardwareLoops/ARM/calls.ll

0 → 100644

+404 −0

Original line number	Diff line number	Diff line
		; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
		; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
		; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
		; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
		; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP


		; CHECK-LABEL: skip_call
		; CHECK-NOT: call void @llvm.set.loop.iterations
		; CHECK-NOT: call i32 @llvm.loop.decrement

		define i32 @skip_call(i32 %n) {
		entry:
		%cmp6 = icmp eq i32 %n, 0
		br i1 %cmp6, label %while.end, label %while.body.preheader

		while.body.preheader:
		br label %while.body

		while.body:
		%i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
		%res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
		%call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
		%add = add nsw i32 %call, %res.07
		%inc1 = add nuw i32 %i.08, 1
		%exitcond = icmp eq i32 %inc1, %n
		br i1 %exitcond, label %while.end.loopexit, label %while.body

		while.end.loopexit:
		br label %while.end

		while.end:
		%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
		ret i32 %res.0.lcssa
		}

		; CHECK-LABEL: test_target_specific
		; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
		; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
		; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
		; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
		; CHECK: br i1 [[CMP]], label %loop, label %exit

		define i32 @test_target_specific(i32* %a, i32* %b) {
		entry:
		br label %loop
		loop:
		%acc = phi i32 [ 0, %entry ], [ %res, %loop ]
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr i32, i32* %a, i32 %count
		%addr.b = getelementptr i32, i32* %b, i32 %count
		%load.a = load i32, i32* %addr.a
		%load.b = load i32, i32* %addr.b
		%res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
		%count.next = add nuw i32 %count, 2
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret i32 %res
		}

		; CHECK-LABEL: test_fabs_f16
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
		define void @test_fabs_f16(half* %a, half* %b) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr half, half* %a, i32 %count
		%load.a = load half, half* %addr.a
		%abs = call half @llvm.fabs.f16(half %load.a)
		%addr.b = getelementptr half, half* %b, i32 %count
		store half %abs, half *%addr.b
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_fabs
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
		define float @test_fabs(float* %a) {
		entry:
		br label %loop
		loop:
		%acc = phi float [ 0.0, %entry ], [ %res, %loop ]
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr float, float* %a, i32 %count
		%load.a = load float, float* %addr.a
		%abs = call float @llvm.fabs.f32(float %load.a)
		%res = fadd float %abs, %acc
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret float %res
		}

		; CHECK-LABEL: test_fabs_64
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-FP64: void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
		define void @test_fabs_64(double* %a, double* %b) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr double, double* %a, i32 %count
		%load.a = load double, double* %addr.a
		%abs = call double @llvm.fabs.f64(double %load.a)
		%addr.b = getelementptr double, double* %b, i32 %count
		store double %abs, double *%addr.b
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_fabs_vec
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
		; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
		; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
		; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
		define <4 x float> @test_fabs_vec(<4 x float>* %a) {
		entry:
		br label %loop
		loop:
		%acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
		%load.a = load <4 x float>, <4 x float>* %addr.a
		%abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
		%res = fadd <4 x float> %abs, %acc
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret <4 x float> %res
		}

		; CHECK-LABEL: test_log
		; CHECK-NOT: call void @llvm.set.loop.iterations
		; CHECK-NOT: llvm.loop.decrement
		define float @test_log(float* %a) {
		entry:
		br label %loop
		loop:
		%acc = phi float [ 0.0, %entry ], [ %res, %loop ]
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr float, float* %a, i32 %count
		%load.a = load float, float* %addr.a
		%abs = call float @llvm.log.f32(float %load.a)
		%res = fadd float %abs, %acc
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret float %res
		}

		; CHECK-LABEL: test_sqrt_16
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
		define void @test_sqrt_16(half* %a, half* %b) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr half, half* %a, i32 %count
		%load.a = load half, half* %addr.a
		%sqrt = call half @llvm.sqrt.f16(half %load.a)
		%addr.b = getelementptr half, half* %b, i32 %count
		store half %sqrt, half *%addr.b
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}
		; CHECK-LABEL: test_sqrt
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP: call void @llvm.set.loop.iterations
		; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
		; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
		; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
		; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
		define void @test_sqrt(float* %a, float* %b) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr float, float* %a, i32 %count
		%load.a = load float, float* %addr.a
		%sqrt = call float @llvm.sqrt.f32(float %load.a)
		%addr.b = getelementptr float, float* %b, i32 %count
		store float %sqrt, float* %addr.b
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_sqrt_64
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
		define void @test_sqrt_64(double* %a, double* %b) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr double, double* %a, i32 %count
		%load.a = load double, double* %addr.a
		%sqrt = call double @llvm.sqrt.f64(double %load.a)
		%addr.b = getelementptr double, double* %b, i32 %count
		store double %sqrt, double *%addr.b
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_sqrt_vec
		; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
		; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
		define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
		%load.a = load <4 x float>, <4 x float>* %addr.a
		%sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
		%addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
		store <4 x float> %sqrt, <4 x float>* %addr.b
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_overflow
		; CHECK: call void @llvm.set.loop.iterations
		define i32 @test_overflow(i32* %a, i32* %b) {
		entry:
		br label %loop
		loop:
		%acc = phi i32 [ 0, %entry ], [ %res, %loop ]
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr i32, i32* %a, i32 %count
		%addr.b = getelementptr i32, i32* %b, i32 %count
		%load.a = load i32, i32* %addr.a
		%load.b = load i32, i32* %addr.b
		%sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
		%res = extractvalue {i32, i1} %sadd, 0
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret i32 %res
		}

		; TODO: We should be able to generate a qadd/sub
		; CHECK-LABEL: test_sat
		; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
		define i32 @test_sat(i32* %a, i32* %b) {
		entry:
		br label %loop
		loop:
		%acc = phi i32 [ 0, %entry ], [ %res, %loop ]
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr i32, i32* %a, i32 %count
		%addr.b = getelementptr i32, i32* %b, i32 %count
		%load.a = load i32, i32* %addr.a
		%load.b = load i32, i32* %addr.b
		%res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret i32 %res
		}

		; CHECK-LABEL: test_masked_i32
		; CHECK-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVEFP: call void @llvm.set.loop.iterations
		; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
		; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
		; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
		; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
		define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
		%addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
		%addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
		%load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
		%load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
		%res = add <4 x i32> %load.a, %load.b
		call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_masked_f32
		; CHECK-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVEFP: call void @llvm.set.loop.iterations
		; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
		; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
		; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
		; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
		define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
		%addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
		%addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
		%load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
		%load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
		%res = fadd <4 x float> %load.a, %load.b
		call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		; CHECK-LABEL: test_gather_scatter
		; CHECK-NOT: call void @llvm.set.loop.iterations
		; CHECK-MVEFP: call void @llvm.set.loop.iterations
		; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
		; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
		; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
		; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
		; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
		define void @test_gather_scatter(<4 x i1> %mask, <4 x float> %a, <4 x float> %b, <4 x float*> %c, <4 x float> %passthru) {
		entry:
		br label %loop
		loop:
		%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
		%load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
		%load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
		%res = fadd <4 x float> %load.a, %load.b
		call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
		%count.next = add nuw i32 %count, 1
		%cmp = icmp ne i32 %count.next, 100
		br i1 %cmp, label %loop, label %exit
		exit:
		ret void
		}

		declare i32 @bar(...) local_unnamed_addr #1
		declare i32 @llvm.arm.smlad(i32, i32, i32)
		declare half @llvm.fabs.f16(half)
		declare float @llvm.fabs.f32(float)
		declare double @llvm.fabs.f64(double)
		declare float @llvm.log.f32(float)
		declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
		declare half @llvm.sqrt.f16(half)
		declare float @llvm.sqrt.f32(float)
		declare double @llvm.sqrt.f64(double)
		declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
		declare i32 @llvm.sadd.sat.i32(i32, i32)
		declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
		declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
		declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
		declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
		declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
		declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
		declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)

llvm/test/Transforms/HardwareLoops/ARM/counter.ll

0 → 100644

+35 −0

File added.

Preview size limit exceeded, changes collapsed.

llvm/test/Transforms/HardwareLoops/ARM/do-rem.ll

0 → 100644

+259 −0

File added.

Preview size limit exceeded, changes collapsed.