Commit 884acbb9 authored by Changpeng Fang

AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare

Summary:
  The accuracy limit for using rcp is lowered from 2.5 ulp to 1.0 ulp,
matching rcp's documented worst-case error of 1 ulp. Also, afn instead of
arcp is now used to decide whether an inaccurate rcp may be used.
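
For illustration, a minimal IR sketch of the two affected cases (an
editorial example, not part of the commit; function names and the 1.0 ulp
metadata value are assumptions):

  ; 1/x with !fpmath >= 1.0 ulp: rcp is now accurate enough for f16, and
  ; for f32 when denormals are flushed.
  define float @rcp_1ulp(float %x) {
    %r = fdiv float 1.0, %x, !fpmath !0   ; -> llvm.amdgcn.rcp.f32(%x)
    ret float %r
  }

  ; afn alone (previously the arcp flag was consulted) permits the
  ; inaccurate form a/b -> a * rcp(b).
  define float @mul_by_rcp(float %a, float %b) {
    %r = fdiv afn float %a, %b            ; -> fmul afn %a, rcp(%b)
    ret float %r
  }

  !0 = !{float 1.000000e+00}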

Reviewers:
  arsenm

Differential Revision: https://reviews.llvm.org/D73588
parent 65209760
+74 −70
@@ -606,24 +606,23 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  return true;
}

-// Perform RCP optimizations:
+// Optimize fdiv with rcp:
//
-// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-//                                                denormals flushed.
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+//               allowed with unsafe-fp-math or afn.
//
-// a/b -> a*rcp(b) when fast unsafe rcp is legal.
-static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
-                            IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
-                            bool HasDenormals, bool NeedHighAccuracy) {
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+                              bool RcpIsAccurate, IRBuilder<> Builder,
+                              Module *Mod) {

-  Type *Ty = Den->getType();
-  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
-                             (HasDenormals || NeedHighAccuracy))
+  if (!AllowInaccurateRcp && !RcpIsAccurate)
    return nullptr;

+  Type *Ty = Den->getType();
  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
-    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+    if (AllowInaccurateRcp || RcpIsAccurate) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
@@ -648,49 +647,63 @@ static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
    }
  }

-  if (FastUnsafeRcpLegal) {
+  if (AllowInaccurateRcp) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
-    return Builder.CreateFMul(Num, Recip, "", FPMath);
+    return Builder.CreateFMul(Num, Recip);
  }
  return nullptr;
}

-static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
-                              bool HasDenormals) {
-  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
-  if (!CNum)
-    return HasDenormals;
-
-  if (FastUnsafeRcpLegal)
-    return true;
-
-  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
-
-  // Reciprocal f32 is handled separately without denormals.
-  return HasDenormals ^ IsOne;
+// optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+                                   bool HasDenormals, IRBuilder<> Builder,
+                                   Module *Mod) {
+  // fdiv.fast can achieve 2.5 ULP accuracy.
+  if (ReqdAccuracy < 2.5f)
+    return nullptr;
+
+  // Only have fdiv.fast for f32.
+  Type *Ty = Den->getType();
+  if (!Ty->isFloatTy())
+    return nullptr;
+
+  bool NumIsOne = false;
+  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+      NumIsOne = true;
+  }
+
+  // fdiv does not support denormals. But 1.0/x is always fine to use it.
+  if (HasDenormals && !NumIsOne)
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  return Builder.CreateCall(Decl, { Num, Den });
}

-// Optimizations is performed based on fpmath, fast math flags as wells as
-// denormals to lower fdiv using either rcp or fdiv.fast.
+// Optimizations is performed based on fpmath, fast math flags as well as
+// denormals to optimize fdiv with either rcp or fdiv.fast.
//
-// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
-//                     unsafe-fp-math, fast math flags, denormals and fpmath
-//                     accuracy request.
-//
-// RCP Optimizations:
-//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-//                                                  denormals flushed.
-//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
-//
-// Use fdiv.fast:
-//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
-//                          fpmath >= 2.5ULP with denormals flushed.
-//
-//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
-//                          fpmath >= 2.5ULP with denormals.
+// With rcp:
+//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+//                 allowed with unsafe-fp-math or afn.
+//
+//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

  Type *Ty = FDiv.getType()->getScalarType();
@@ -700,19 +713,17 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
+  const float ReqdAccuracy =  FPOp->getFPAccuracy();

+  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  // Determine whether it is ok to use rcp based on unsafe-fp-math,
-  // fast math flags, denormals and accuracy request.
-  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
-          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
-                                     || FMF.approxFunc()));
+  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

-  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
-  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
-                           !FastUnsafeRcpLegal;
+  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+  // rcp_f64 is never accurate.
+  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+            (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
@@ -730,31 +741,24 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt = nullptr;
-      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
-                                           HasFP32Denormals)) {
-        Function *Decl =
-                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
-      }
-      if (!NewElt) // Try rcp.
-        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
-                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
-      if (!NewElt)
-        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
+      // Try rcp first.
+      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+                                      RcpIsAccurate, Builder, Mod);
+      if (!NewElt) // Try fdiv.fast.
+        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+                                      HasFP32Denormals, Builder, Mod);
+      if (!NewElt) // Keep the original.
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
-  } else { // Scalar.
-    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
-                                          HasFP32Denormals)) {
-      Function *Decl =
-               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
-    }
-    if (!NewFDiv) { // Try rcp.
-      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
-                              Mod, HasFP32Denormals, NeedHighAccuracy);
+  } else { // Scalar FDiv.
+    // Try rcp first.
+    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+                              Builder, Mod);
+    if (!NewFDiv) { // Try fdiv.fast.
+      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+                                     Builder, Mod);
    }
  }

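(Editorial sketch of the resulting visitFDiv pipeline, assuming f32 with
denormals flushed, no fast-math flags, and a hypothetical function: rcp only
accepts the constant 1.0/x pattern without afn or unsafe-fp-math, so a
general a/b carrying a 2.5 ulp !fpmath request falls through to fdiv.fast,
and anything stricter keeps the original fdiv.)

  define float @general_div(float %a, float %b) {
    %r = fdiv float %a, %b, !fpmath !1    ; -> llvm.amdgcn.fdiv.fast(%a, %b)
    ret float %r
  }
  !1 = !{float 2.500000e+00}
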
+6 −13
@@ -7418,19 +7418,12 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

-  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
-         (Flags.hasAllowReciprocal() &&
-          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
-            VT == MVT::f16 ||
-            Flags.hasApproximateFuncs()));
-
-  // Do rcp optimization only when fast unsafe rcp is legal here.
-  // NOTE: We already performed RCP optimization to insert intrinsics in
-  // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to
-  // rcp optimization.
-  //   However, there are cases like FREM, which is expended into a sequence
-  // of instructions including FDIV, which may expose new opportunities.
-  if (!FastUnsafeRcpLegal)
+  bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
+                            Flags.hasApproximateFuncs();
+
+  // Without !fpmath accuracy information, we can't do more because we don't
+  // know exactly whether rcp is accurate enough to meet !fpmath requirement.
+  if (!AllowInaccurateRcp)
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
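
(Editorial note: the comment removed above records why this DAG-level path
is still reachable, namely that operations such as frem are expanded late
into sequences containing fdiv, after AMDGPUCodeGenPrepare has already run.
A minimal reproducer, assuming the afn flag:)

  define float @frem_case(float %x, float %y) {
    %r = frem afn float %x, %y   ; expansion introduces an fdiv that reaches
    ret float %r                 ; lowerFastUnsafeFDIV
  }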
+86 −111

File changed; preview size limit exceeded, changes collapsed.

+45 −22
@@ -63,7 +63,7 @@ entry:
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv half 1.0, %b.val
+  %r.val = fdiv half 1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -82,25 +82,46 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.abs = call half @llvm.fabs.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.abs
+  %r.val = fdiv half 1.0, %b.abs, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

-; GCN-LABEL: {{^}}v_rcp_f16_arcp:
+; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp.
+
+; GCN-LABEL: {{^}}reciprocal_f16_rounded:
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]]
+; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]]
+; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]]
+; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv half 1.0, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_afn:
 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
 ; GFX8_9_10-NOT: [[VAL]]
 ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GFX8_9_10-NOT: [[RESULT]]
 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv arcp half 1.0, %b.val
+  %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -118,7 +139,7 @@ entry:
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv half -1.0, %b.val
+  %r.val = fdiv half -1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -137,7 +158,7 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.sqrt
+  %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -157,12 +178,12 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half -1.0, %b.sqrt
+  %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

-; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
+; GCN-LABEL: {{^}}v_fdiv_f16_afn:
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

@@ -170,7 +191,7 @@ entry:
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]

; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
@@ -179,7 +200,7 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv arcp half %a.val, %b.val
+  %r.val = fdiv afn half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -206,38 +227,38 @@ entry:
  ret void
}

-; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
+; FUNC-LABEL: {{^}}div_afn_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, 2.0
+  %rcp = fdiv afn half %x, 2.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

-; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, 10.0
+  %rcp = fdiv afn half %x, 10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

-; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_neg_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, -10.0
+  %rcp = fdiv afn half %x, -10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}
@@ -249,3 +270,5 @@ declare half @llvm.fabs.f16(half) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }

+!0 = !{float 2.500000e+00}
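
; Editorial addendum: for f16, any !fpmath request of at least 1.0 ulp
; (including the 2.5 ulp !0 above) now allows rcp even without afn, while
; an fdiv carrying no !fpmath defaults to a requested accuracy of 0.0 and
; keeps the full-precision expansion, as reciprocal_f16_rounded checks.
;
; A hypothetical stricter request that still permits rcp:
;   %r.val = fdiv half 1.0, %b.val, !fpmath !1
;   !1 = !{float 1.000000e+00}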