Commit 25315359 authored by Changpeng Fang's avatar Changpeng Fang
Browse files

AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare

    Summary:
      RCP has an accuracy limit. If the FDIV !fpmath metadata requires high accuracy,
    rcp may not meet the requirement. However, in DAG lowering, fpmath information
    gets lost, and thus we may generate either inaccurate rcp-related computation
    or slow code for fdiv.

    This patch implements fdiv optimizations in AMDGPUCodeGenPrepare, which has
    exact knowledge of !fpmath.

     FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
                         unsafe-fp-math, fast math flags, denormals and fpmath
                         accuracy request.

     RCP Optimizations:
       1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
                                                      denormals flushed.
       a/b -> a*rcp(b) when fast unsafe rcp is legal.

     Use fdiv.fast:
       a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
                              fpmath >= 2.5ULP with denormals flushed.

       1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
                              fpmath >= 2.5ULP with denormals.

    Reviewers:
      arsenm

    Differential Revision:
      https://reviews.llvm.org/D71293
parent 7ad17e00
Loading
Loading
Loading
Loading
+110 −32
Original line number Diff line number Diff line
@@ -606,12 +606,64 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
// Perform RCP optimizations:
//
// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
//                                                denormals flushed.
//
// a/b -> a*rcp(b) when fast unsafe rcp is legal.
//
// Returns the replacement value, or nullptr when no rcp-based transform is
// applicable; the caller falls back to fdiv.fast or a plain fdiv.
static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
                            IRBuilder<> &Builder, MDNode *FPMath, Module *Mod,
                            bool HasDenormals, bool NeedHighAccuracy) {
  // Pass the builder by reference: copying an IRBuilder duplicates its
  // insertion point, folder and FMF state for no benefit.
  Type *Ty = Den->getType();
  // Without fast/unsafe rcp, an f32 rcp is only acceptable when denormals
  // are flushed and the requested accuracy permits it (>= 2.5 ULP).
  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
                             (HasDenormals || NeedHighAccuracy))
    return nullptr;

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateCall(Decl, { Den });
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        Value *FNeg = Builder.CreateFNeg(Den);
        return Builder.CreateCall(Decl, { FNeg });
      }
    }
  }

  if (FastUnsafeRcpLegal) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
    return Builder.CreateFMul(Num, Recip, "", FPMath);
  }
  return nullptr;
}

static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
                              bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
  if (FastUnsafeRcpLegal)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
@@ -620,44 +672,57 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.

// Optimization is performed based on fpmath, fast math flags, as well as
// denormals, to lower fdiv using either rcp or fdiv.fast.
//
// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
//                     unsafe-fp-math, fast math flags, denormals and fpmath
//                     accuracy request.
//
// RCP Optimizations:
//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
//                                                  denormals flushed.
//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
//
// Use fdiv.fast:
//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
//                          fpmath >= 2.5ULP with denormals flushed.
//
//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
//                          fpmath >= 2.5ULP with denormals.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;
  Type *Ty = FDiv.getType()->getScalarType();

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
  // No intrinsic for fdiv16 if target does not support f16.
  if (Ty->isHalfTy() && !ST->has16BitInsts())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;
  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                                      FMF.allowReciprocal();
  // Determine whether it is ok to use rcp based on unsafe-fp-math,
  // fast math flags, denormals and accuracy request.
  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
                                     || FMF.approxFunc()));

  // With UnsafeDiv node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;
  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
                           !FastUnsafeRcpLegal;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
  if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -665,19 +730,32 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }
      Value *NewElt = nullptr;
      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
                                           HasFP32Denormals)) {
        Function *Decl =
                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
      }
      if (!NewElt) // Try rcp.
        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
      if (!NewElt)
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  } else { // Scalar.
    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
                                          HasFP32Denormals)) {
      Function *Decl =
               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
    }
    if (!NewFDiv) { // Try rcp.
      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
                              Mod, HasFP32Denormals, NeedHighAccuracy);
    }
  }

  if (NewFDiv) {
+44 −34
Original line number Diff line number Diff line
@@ -7474,13 +7474,23 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
         (Flags.hasAllowReciprocal() &&
          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
            VT == MVT::f16 ||
            Flags.hasApproximateFuncs()));

  // Do rcp optimization only when fast unsafe rcp is legal here.
  // NOTE: We already performed RCP optimization to insert intrinsics in
  // AMDGPUCodeGenPrepare. Ideally there should be no opportunity here for
  // rcp optimization.
  //   However, there are cases like FREM, which is expanded into a sequence
  // of instructions including FDIV, which may expose new opportunities.
  if (!FastUnsafeRcpLegal)
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
    if (CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
@@ -7507,18 +7517,13 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    }
  }
  }

  if (Unsafe) {
  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}

  return SDValue();
}

static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
@@ -8663,6 +8668,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
                           N->getFlags());
  }

  if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
                           N0.getOperand(0), N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}

+171 −27

File changed.

Preview size limit exceeded, changes collapsed.

+62 −0
Original line number Diff line number Diff line
@@ -284,6 +284,68 @@ define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out,
  ret void
}

; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX10: s_denorm_mode 15
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX10: s_denorm_mode 12
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],

define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 {
entry:
  %fdiv = fdiv float 1.000000e+00, %a
  store float %fdiv, float addrspace(1)* %out
  ret void
}


; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; PREGFX10-NOT: s_setreg
; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10-NOT: s_setreg

; GFX10-NOT: s_denorm_mode
; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
; GFX10-NOT: s_denorm_mode

; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 {
entry:
  %fdiv = fdiv float 1.000000e+00, %a
  store float %fdiv, float addrspace(1)* %out
  ret void
}


attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
+24 −40
Original line number Diff line number Diff line
@@ -348,7 +348,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float 1.000000e+00, %load
  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}
@@ -359,7 +359,7 @@ define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float -1.000000e+00, %load
  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}
@@ -370,7 +370,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %neg = fsub float -0.000000e+00, %load, !fpmath !0
  %div = fdiv fast float 1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
@@ -382,22 +382,18 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %neg = fsub float -0.000000e+00, %load, !fpmath !0
  %div = fdiv fast float -1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float 1.000000e+00, %load
@@ -406,15 +402,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg
}

; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float -1.000000e+00, %load
@@ -423,15 +415,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)
}

; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
@@ -441,15 +429,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
Loading