Commit 25315359 authored by Changpeng Fang's avatar Changpeng Fang
Browse files

AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare

    Summary:
      RCP has an accuracy limit. If the FDIV !fpmath metadata requires high accuracy,
    rcp may not meet the requirement. However, in DAG lowering, fpmath information
    gets lost, and thus we may generate either inaccurate rcp-related computation
    or slow code for fdiv.

    This patch implements fdiv optimizations in AMDGPUCodeGenPrepare, which has
    exact knowledge of !fpmath.

     FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
                         unsafe-fp-math, fast math flags, denormals and fpmath
                         accuracy request.

     RCP Optimizations:
       1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
                                                      denormals flushed.
       a/b -> a*rcp(b) when fast unsafe rcp is legal.

     Use fdiv.fast:
       a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
                              fpmath >= 2.5ULP with denormals flushed.

       1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
                              fpmath >= 2.5ULP with denormals.

    Reviewers:
      arsenm

    Differential Revision:
      https://reviews.llvm.org/D71293
parent 7ad17e00
Loading
Loading
Loading
Loading
+110 −32
Original line number Diff line number Diff line
@@ -606,12 +606,64 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
// Perform RCP optimizations:
//
// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
//                                                denormals flushed.
//
// a/b -> a*rcp(b) when fast unsafe rcp is legal.
//
// Returns the replacement value, or nullptr when no rcp-based transform is
// applicable; the caller falls back to fdiv.fast or a plain fdiv.
static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
                            IRBuilder<> &Builder, MDNode *FPMath, Module *Mod,
                            bool HasDenormals, bool NeedHighAccuracy) {
  // Pass the builder by reference: copying an IRBuilder duplicates its
  // insertion point, folder and FMF state for no benefit.
  Type *Ty = Den->getType();
  // Without fast/unsafe rcp, an f32 rcp is only acceptable when denormals
  // are flushed and the requested accuracy permits it (>= 2.5 ULP).
  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
                             (HasDenormals || NeedHighAccuracy))
    return nullptr;

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateCall(Decl, { Den });
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        Value *FNeg = Builder.CreateFNeg(Den);
        return Builder.CreateCall(Decl, { FNeg });
      }
    }
  }

  if (FastUnsafeRcpLegal) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
    return Builder.CreateFMul(Num, Recip, "", FPMath);
  }
  return nullptr;
}

static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
                              bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
  if (FastUnsafeRcpLegal)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
@@ -620,44 +672,57 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.

// Optimization is performed based on fpmath, fast math flags, as well as
// denormals, to lower fdiv using either rcp or fdiv.fast.
//
// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
//                     unsafe-fp-math, fast math flags, denormals and fpmath
//                     accuracy request.
//
// RCP Optimizations:
//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
//                                                  denormals flushed.
//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
//
// Use fdiv.fast:
//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
//                          fpmath >= 2.5ULP with denormals flushed.
//
//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
//                          fpmath >= 2.5ULP with denormals.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;
  Type *Ty = FDiv.getType()->getScalarType();

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
  // No intrinsic for fdiv16 if target does not support f16.
  if (Ty->isHalfTy() && !ST->has16BitInsts())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;
  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                                      FMF.allowReciprocal();
  // Determine whether it is ok to use rcp based on unsafe-fp-math,
  // fast math flags, denormals and accuracy request.
  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
                                     || FMF.approxFunc()));

  // With UnsafeDiv node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;
  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
                           !FastUnsafeRcpLegal;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
  if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -665,19 +730,32 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }
      Value *NewElt = nullptr;
      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
                                           HasFP32Denormals)) {
        Function *Decl =
                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
      }
      if (!NewElt) // Try rcp.
        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
      if (!NewElt)
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  } else { // Scalar.
    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
                                          HasFP32Denormals)) {
      Function *Decl =
               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
    }
    if (!NewFDiv) { // Try rcp.
      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
                              Mod, HasFP32Denormals, NeedHighAccuracy);
    }
  }

  if (NewFDiv) {
+44 −34
Original line number Diff line number Diff line
@@ -7474,13 +7474,23 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
         (Flags.hasAllowReciprocal() &&
          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
            VT == MVT::f16 ||
            Flags.hasApproximateFuncs()));

  // Do rcp optimization only when fast unsafe rcp is legal here.
  // NOTE: We already performed RCP optimization to insert intrinsics in
  // AMDGPUCodeGenPrepare. Ideally there should be no opportunity here for
  // rcp optimization.
  //   However, there are cases like FREM, which is expanded into a sequence
  // of instructions including FDIV, which may expose new opportunities.
  if (!FastUnsafeRcpLegal)
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
    if (CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
@@ -7507,18 +7517,13 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    }
  }
  }

  if (Unsafe) {
  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}

  return SDValue();
}

static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
@@ -8663,6 +8668,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
                           N->getFlags());
  }

  if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
                           N0.getOperand(0), N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}

+171 −27

File changed.

Preview size limit exceeded, changes collapsed.

+62 −0
Original line number Diff line number Diff line
@@ -284,6 +284,68 @@ define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out,
  ret void
}

; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX10: s_denorm_mode 15
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX10: s_denorm_mode 12
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],

define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 {
entry:
  %fdiv = fdiv float 1.000000e+00, %a
  store float %fdiv, float addrspace(1)* %out
  ret void
}


; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; PREGFX10-NOT: s_setreg
; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10-NOT: s_setreg

; GFX10-NOT: s_denorm_mode
; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
; GFX10-NOT: s_denorm_mode

; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 {
entry:
  %fdiv = fdiv float 1.000000e+00, %a
  store float %fdiv, float addrspace(1)* %out
  ret void
}


attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
+24 −40
Original line number Diff line number Diff line
@@ -348,7 +348,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float 1.000000e+00, %load
  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}
@@ -359,7 +359,7 @@ define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float -1.000000e+00, %load
  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}
@@ -370,7 +370,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %neg = fsub float -0.000000e+00, %load, !fpmath !0
  %div = fdiv fast float 1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
@@ -382,22 +382,18 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %neg = fsub float -0.000000e+00, %load, !fpmath !0
  %div = fdiv fast float -1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float 1.000000e+00, %load
@@ -406,15 +402,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg
}

; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float -1.000000e+00, %load
@@ -423,15 +415,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)
}

; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
@@ -441,15 +429,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
Loading