Commit 884acbb9 authored by Changpeng Fang

AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare

Summary:
  The accuracy limit for using rcp is lowered from 2.5 ulp to 1.0 ulp,
matching rcp's documented worst-case error of 1 ulp. Also, afn instead of
arcp is now used to decide whether an inaccurate rcp may be used.
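
For illustration, a minimal IR sketch of the two affected cases (an
editorial example, not part of the commit; function names and the 1.0 ulp
metadata value are assumptions):

  ; 1/x with !fpmath >= 1.0 ulp: rcp is now accurate enough for f16, and
  ; for f32 when denormals are flushed.
  define float @rcp_1ulp(float %x) {
    %r = fdiv float 1.0, %x, !fpmath !0   ; -> llvm.amdgcn.rcp.f32(%x)
    ret float %r
  }

  ; afn alone (previously the arcp flag was consulted) permits the
  ; inaccurate form a/b -> a * rcp(b).
  define float @mul_by_rcp(float %a, float %b) {
    %r = fdiv afn float %a, %b            ; -> fmul afn %a, rcp(%b)
    ret float %r
  }

  !0 = !{float 1.000000e+00}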

Reviewers:
  arsenm

Differential Revision: https://reviews.llvm.org/D73588
parent 65209760
+74 −70
@@ -606,24 +606,23 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  return true;
}

-// Perform RCP optimizations:
+// Optimize fdiv with rcp:
//
-// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-//                                                denormals flushed.
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+//               allowed with unsafe-fp-math or afn.
//
-// a/b -> a*rcp(b) when fast unsafe rcp is legal.
-static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
-                            IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
-                            bool HasDenormals, bool NeedHighAccuracy) {
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+                              bool RcpIsAccurate, IRBuilder<> Builder,
+                              Module *Mod) {

-  Type *Ty = Den->getType();
-  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
-                             (HasDenormals || NeedHighAccuracy))
+  if (!AllowInaccurateRcp && !RcpIsAccurate)
    return nullptr;

+  Type *Ty = Den->getType();
  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
-    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+    if (AllowInaccurateRcp || RcpIsAccurate) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
@@ -648,49 +647,63 @@ static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
    }
  }

-  if (FastUnsafeRcpLegal) {
+  if (AllowInaccurateRcp) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
-    return Builder.CreateFMul(Num, Recip, "", FPMath);
+    return Builder.CreateFMul(Num, Recip);
  }
  return nullptr;
}

-static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
-                              bool HasDenormals) {
-  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
-  if (!CNum)
-    return HasDenormals;
-
-  if (FastUnsafeRcpLegal)
-    return true;
-
-  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
-
-  // Reciprocal f32 is handled separately without denormals.
-  return HasDenormals ^ IsOne;
+// optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+                                   bool HasDenormals, IRBuilder<> Builder,
+                                   Module *Mod) {
+  // fdiv.fast can achieve 2.5 ULP accuracy.
+  if (ReqdAccuracy < 2.5f)
+    return nullptr;
+
+  // Only have fdiv.fast for f32.
+  Type *Ty = Den->getType();
+  if (!Ty->isFloatTy())
+    return nullptr;
+
+  bool NumIsOne = false;
+  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+      NumIsOne = true;
+  }
+
+  // fdiv does not support denormals. But 1.0/x is always fine to use it.
+  if (HasDenormals && !NumIsOne)
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  return Builder.CreateCall(Decl, { Num, Den });
}

-// Optimizations is performed based on fpmath, fast math flags as wells as
-// denormals to lower fdiv using either rcp or fdiv.fast.
+// Optimizations is performed based on fpmath, fast math flags as well as
+// denormals to optimize fdiv with either rcp or fdiv.fast.
//
-// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
-//                     unsafe-fp-math, fast math flags, denormals and fpmath
-//                     accuracy request.
-//
-// RCP Optimizations:
-//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-//                                                  denormals flushed.
-//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
-//
-// Use fdiv.fast:
-//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
-//                          fpmath >= 2.5ULP with denormals flushed.
-//
-//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
-//                          fpmath >= 2.5ULP with denormals.
+// With rcp:
+//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+//                 allowed with unsafe-fp-math or afn.
+//
+//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

  Type *Ty = FDiv.getType()->getScalarType();
@@ -700,19 +713,17 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
+  const float ReqdAccuracy =  FPOp->getFPAccuracy();

+  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  // Determine whether it is ok to use rcp based on unsafe-fp-math,
-  // fast math flags, denormals and accuracy request.
-  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
-          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
-                                     || FMF.approxFunc()));
+  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

-  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
-  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
-                           !FastUnsafeRcpLegal;
+  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+  // rcp_f64 is never accurate.
+  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+            (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
@@ -730,31 +741,24 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt = nullptr;
-      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
-                                           HasFP32Denormals)) {
-        Function *Decl =
-                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
-      }
-      if (!NewElt) // Try rcp.
-        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
-                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
-      if (!NewElt)
-        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
+      // Try rcp first.
+      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+                                      RcpIsAccurate, Builder, Mod);
+      if (!NewElt) // Try fdiv.fast.
+        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+                                      HasFP32Denormals, Builder, Mod);
+      if (!NewElt) // Keep the original.
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
-  } else { // Scalar.
-    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
-                                          HasFP32Denormals)) {
-      Function *Decl =
-               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
-    }
-    if (!NewFDiv) { // Try rcp.
-      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
-                              Mod, HasFP32Denormals, NeedHighAccuracy);
+  } else { // Scalar FDiv.
+    // Try rcp first.
+    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+                              Builder, Mod);
+    if (!NewFDiv) { // Try fdiv.fast.
+      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+                                     Builder, Mod);
    }
  }

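(Editorial sketch of the resulting visitFDiv pipeline, assuming f32 with
denormals flushed, no fast-math flags, and a hypothetical function: rcp only
accepts the constant 1.0/x pattern without afn or unsafe-fp-math, so a
general a/b carrying a 2.5 ulp !fpmath request falls through to fdiv.fast,
and anything stricter keeps the original fdiv.)

  define float @general_div(float %a, float %b) {
    %r = fdiv float %a, %b, !fpmath !1    ; -> llvm.amdgcn.fdiv.fast(%a, %b)
    ret float %r
  }
  !1 = !{float 2.500000e+00}
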
+6 −13
@@ -7418,19 +7418,12 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

-  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
-         (Flags.hasAllowReciprocal() &&
-          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
-            VT == MVT::f16 ||
-            Flags.hasApproximateFuncs()));
-
-  // Do rcp optimization only when fast unsafe rcp is legal here.
-  // NOTE: We already performed RCP optimization to insert intrinsics in
-  // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to
-  // rcp optimization.
-  //   However, there are cases like FREM, which is expended into a sequence
-  // of instructions including FDIV, which may expose new opportunities.
-  if (!FastUnsafeRcpLegal)
+  bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
+                            Flags.hasApproximateFuncs();
+
+  // Without !fpmath accuracy information, we can't do more because we don't
+  // know exactly whether rcp is accurate enough to meet !fpmath requirement.
+  if (!AllowInaccurateRcp)
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
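
(Editorial note: the comment removed above records why this DAG-level path
is still reachable, namely that operations such as frem are expanded late
into sequences containing fdiv, after AMDGPUCodeGenPrepare has already run.
A minimal reproducer, assuming the afn flag:)

  define float @frem_case(float %x, float %y) {
    %r = frem afn float %x, %y   ; expansion introduces an fdiv that reaches
    ret float %r                 ; lowerFastUnsafeFDIV
  }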
+86 −111

File changed; preview size limit exceeded, changes collapsed.

+45 −22
@@ -63,7 +63,7 @@ entry:
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv half 1.0, %b.val
+  %r.val = fdiv half 1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -82,25 +82,46 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.abs = call half @llvm.fabs.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.abs
+  %r.val = fdiv half 1.0, %b.abs, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

-; GCN-LABEL: {{^}}v_rcp_f16_arcp:
+; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp.
+
+; GCN-LABEL: {{^}}reciprocal_f16_rounded:
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]]
+; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]]
+; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]]
+; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv half 1.0, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_afn:
 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
 ; GFX8_9_10-NOT: [[VAL]]
 ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GFX8_9_10-NOT: [[RESULT]]
 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv arcp half 1.0, %b.val
+  %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -118,7 +139,7 @@ entry:
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv half -1.0, %b.val
+  %r.val = fdiv half -1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -137,7 +158,7 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.sqrt
+  %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -157,12 +178,12 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half -1.0, %b.sqrt
+  %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

-; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
+; GCN-LABEL: {{^}}v_fdiv_f16_afn:
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

@@ -170,7 +191,7 @@ entry:
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]

; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
@@ -179,7 +200,7 @@ entry:
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv arcp half %a.val, %b.val
+  %r.val = fdiv afn half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}
@@ -206,38 +227,38 @@ entry:
  ret void
}

-; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
+; FUNC-LABEL: {{^}}div_afn_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, 2.0
+  %rcp = fdiv afn half %x, 2.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

-; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, 10.0
+  %rcp = fdiv afn half %x, 10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

-; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_neg_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, -10.0
+  %rcp = fdiv afn half %x, -10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}
@@ -249,3 +270,5 @@ declare half @llvm.fabs.f16(half) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }

+!0 = !{float 2.500000e+00}
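
; Editorial addendum: for f16, any !fpmath request of at least 1.0 ulp
; (including the 2.5 ulp !0 above) now allows rcp even without afn, while
; an fdiv carrying no !fpmath defaults to a requested accuracy of 0.0 and
; keeps the full-precision expansion, as reciprocal_f16_rounded checks.
;
; A hypothetical stricter request that still permits rcp:
;   %r.val = fdiv half 1.0, %b.val, !fpmath !1
;   !1 = !{float 1.000000e+00}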