Commit 3239b503 authored by Wang, Pengfei

[FPEnv] Add pragma FP_CONTRACT support under strict FP.

Summary: Support pragma FP_CONTRACT under strict FP.

Reviewers: craig.topper, andrew.w.kaylor, uweigand, RKSimon, LiuChen3

Subscribers: hiraditya, jdoerfert, cfe-commits, llvm-commits, LuoYuanke

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D72820
parent 4c8817cd
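
For context, here is a minimal source-level sketch of what this change enables. It is not part of the commit message; the driver flag and the expected IR are assumptions based on the clang test updated below.

// contract.cpp -- compile in a strict-FP mode, e.g.:
//   clang -c -ffp-exception-behavior=strict contract.cpp
// Previously, contraction was not performed under strict FP; with this
// change, a*b+c below contracts into the constrained fmuladd intrinsic.
#pragma STDC FP_CONTRACT ON

float contract(float a, float b, float c) {
  // Expected to lower to @llvm.experimental.constrained.fmuladd.f32
  // rather than separate constrained fmul/fadd calls.
  return a * b + c;
}
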
@@ -3365,7 +3365,7 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF,
// the add operand respectively. This allows fmuladd to represent a*b-c, or
// c-a*b. Patterns in LLVM should catch the negated forms and translate them to
// efficient operations.
static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend,
static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend,
const CodeGenFunction &CGF, CGBuilderTy &Builder,
bool negMul, bool negAdd) {
assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set.");
@@ -3377,12 +3377,23 @@ static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend,
if (negAdd)
Addend = Builder.CreateFNeg(Addend, "neg");
Value *FMulAdd = Builder.CreateCall(
CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
{MulOp0, MulOp1, Addend});
MulOp->eraseFromParent();
Value *FMulAdd = nullptr;
if (Builder.getIsFPConstrained()) {
assert(isa<llvm::ConstrainedFPIntrinsic>(MulOp) &&
"Only constrained operation should be created when Builder is in FP "
"constrained mode");
FMulAdd = Builder.CreateConstrainedFPCall(
CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd,
Addend->getType()),
{MulOp0, MulOp1, Addend});
} else {
FMulAdd = Builder.CreateCall(
CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
{MulOp0, MulOp1, Addend});
}
MulOp->eraseFromParent();
return FMulAdd;
}
// Check whether it would be legal to emit an fmuladd intrinsic call to
@@ -3417,6 +3428,19 @@ static Value* tryEmitFMulAdd(const BinOpInfo &op,
return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
}
if (auto *LHSBinOp = dyn_cast<llvm::CallBase>(op.LHS)) {
if (LHSBinOp->getIntrinsicID() ==
llvm::Intrinsic::experimental_constrained_fmul &&
LHSBinOp->use_empty())
return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub);
}
if (auto *RHSBinOp = dyn_cast<llvm::CallBase>(op.RHS)) {
if (RHSBinOp->getIntrinsicID() ==
llvm::Intrinsic::experimental_constrained_fmul &&
RHSBinOp->use_empty())
return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
}
return nullptr;
}
......
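
The CodeGen change above keys off the builder's constrained-FP mode: under strict FP the multiply is an experimental_constrained_fmul call rather than a BinaryOperator, so both the matcher and buildFMulAdd must handle CallBase. A minimal stand-alone sketch of the same dispatch (the helper name is hypothetical; the IRBuilder APIs are the ones the commit itself uses):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical helper mirroring buildFMulAdd's dispatch. In constrained mode,
// CreateConstrainedFPCall appends the current rounding-mode and
// exception-behavior metadata operands to the call for us.
static Value *emitFMulAddLike(IRBuilder<> &B, Module &M, Value *A, Value *X,
                              Value *C) {
  Type *Ty = A->getType();
  if (B.getIsFPConstrained()) {
    Function *F = Intrinsic::getDeclaration(
        &M, Intrinsic::experimental_constrained_fmuladd, {Ty});
    return B.CreateConstrainedFPCall(F, {A, X, C});
  }
  Function *F = Intrinsic::getDeclaration(&M, Intrinsic::fmuladd, {Ty});
  return B.CreateCall(F, {A, X, C});
}
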
@@ -148,3 +148,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
};
#pragma STDC FP_CONTRACT ON
void bar(float f) {
f * f + f;
(double)f * f - f;
(long double)-f * f + f;
// CHECK: call float @llvm.experimental.constrained.fmuladd.f32
// CHECK: fneg
// CHECK: call double @llvm.experimental.constrained.fmuladd.f64
// CHECK: fneg
// CHECK: call x86_fp80 @llvm.experimental.constrained.fmuladd.f80
};
@@ -16141,6 +16141,69 @@ if either operand is a SNAN. The signaling comparison operation
performed by '``llvm.experimental.constrained.fcmps``' will raise an
exception if either operand is a NAN (QNAN or SNAN).
 
'``llvm.experimental.constrained.fmuladd``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>,
<type> <op3>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.fmuladd``' intrinsic represents
multiply-add expressions that can be fused if the code generator determines
that (a) the target instruction set has support for a fused operation,
and (b) that the fused operation is more efficient than the equivalent,
separate pair of mul and add instructions.
Arguments:
""""""""""
The first three arguments to the '``llvm.experimental.constrained.fmuladd``'
intrinsic must be floating-point or vector of floating-point values.
All three arguments must have identical types.
The fourth and fifth arguments specify the rounding mode and exception behavior
as described above.
Semantics:
""""""""""
The expression:
::
%0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c,
metadata <rounding mode>,
metadata <exception behavior>)
is equivalent to the expression:
::
%0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b,
metadata <rounding mode>,
metadata <exception behavior>)
%1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c,
metadata <rounding mode>,
metadata <exception behavior>)
except that it is unspecified whether rounding will be performed between the
multiplication and addition steps. Fusion is not guaranteed, even if the target
platform supports it.
If a fused multiply-add is required, the corresponding
:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should be
used instead.
Like '``llvm.experimental.constrained.fma.*``', this intrinsic never sets errno.
Constrained libm-equivalent Intrinsics
--------------------------------------
 
......
@@ -1288,6 +1288,9 @@ public:
case Intrinsic::fmuladd:
ISDs.push_back(ISD::FMA);
break;
case Intrinsic::experimental_constrained_fmuladd:
ISDs.push_back(ISD::STRICT_FMA);
break;
// FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
@@ -1511,6 +1514,12 @@ public:
if (IID == Intrinsic::fmuladd)
return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
if (IID == Intrinsic::experimental_constrained_fmuladd)
return ConcreteTTI->getIntrinsicCost(
Intrinsic::experimental_constrained_fmul, RetTy, Tys,
nullptr) +
ConcreteTTI->getIntrinsicCost(
Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr);
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
......
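
The cost-model addition above prices a constrained fmuladd that is not handled as a single STRICT_FMA node as the sum of its unfused parts. A toy model of the rule (names are illustrative, not the TTI interface):

// Toy model of the new cost rule: fused cost if the target handles
// STRICT_FMA, otherwise constrained fmul + constrained fadd.
unsigned constrainedFMulAddCost(bool HasStrictFMA, unsigned StrictFMACost,
                                unsigned FMulCost, unsigned FAddCost) {
  return HasStrictFMA ? StrictFMACost : FMulCost + FAddCost;
}
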
@@ -95,6 +95,10 @@ DAG_FUNCTION(sin, 1, 1, experimental_constrained_sin, FSIN)
DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT)
DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC)
// This is the definition for the fmuladd intrinsic function, which is
// converted into constrained FMA or FMUL + FADD intrinsics.
FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd)
#undef INSTRUCTION
#undef FUNCTION
#undef CMP_INSTRUCTION
......
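
ConstrainedOps.def is an X-macro table: DAG_FUNCTION entries map one-to-one onto STRICT_* SelectionDAG opcodes, while plain FUNCTION entries such as the new fmuladd line get custom handling (see the SelectionDAGBuilder change below). A sketch of how a consumer includes the table; this particular consumer is hypothetical:

#include "llvm/IR/Intrinsics.h"

// Define only the macro you care about; ConstrainedOps.def supplies empty
// defaults for the others and #undefs everything at the end of the file.
static bool hasOneToOneStrictNode(unsigned IID) {
  switch (IID) {
#define DAG_FUNCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN)                  \
  case llvm::Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
    return true;  // e.g. experimental_constrained_sqrt -> STRICT_FSQRT
  default:
    return false; // e.g. experimental_constrained_fmuladd (FUNCTION entry)
  }
}
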
@@ -640,6 +640,13 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in {
llvm_metadata_ty,
llvm_metadata_ty ]>;
def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ],
[ LLVMMatchType<0>,
LLVMMatchType<0>,
LLVMMatchType<0>,
llvm_metadata_ty,
llvm_metadata_ty ]>;
def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ],
[ llvm_anyfloat_ty,
llvm_metadata_ty ]>;
......
@@ -7021,6 +7021,35 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
Opers.push_back(getValue(FPI.getArgOperand(1)));
}
auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
assert(Result.getNode()->getNumValues() == 2);
// Push node to the appropriate list so that future instructions can be
// chained up correctly.
SDValue OutChain = Result.getValue(1);
switch (EB) {
case fp::ExceptionBehavior::ebIgnore:
// The only reason why ebIgnore nodes still need to be chained is that
// they might depend on the current rounding mode, and therefore must
// not be moved across instructions that may change that mode.
LLVM_FALLTHROUGH;
case fp::ExceptionBehavior::ebMayTrap:
// These must not be moved across calls or instructions that may change
// floating-point exception masks.
PendingConstrainedFP.push_back(OutChain);
break;
case fp::ExceptionBehavior::ebStrict:
// These must not be moved across calls or instructions that may change
// floating-point exception masks or read floating-point exception flags.
// In addition, they cannot be optimized out even if unused.
PendingConstrainedFPStrict.push_back(OutChain);
break;
}
};
SDVTList VTs = DAG.getVTList(ValueVTs);
fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
unsigned Opcode;
switch (FPI.getIntrinsicID()) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -7029,6 +7058,23 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
Opcode = ISD::STRICT_##DAGN; \
break;
#include "llvm/IR/ConstrainedOps.def"
case Intrinsic::experimental_constrained_fmuladd: {
Opcode = ISD::STRICT_FMA;
// Break fmuladd into fmul and fadd.
if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
!TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(),
ValueVTs[0])) {
Opers.pop_back();
SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers);
pushOutChain(Mul, EB);
Opcode = ISD::STRICT_FADD;
Opers.clear();
Opers.push_back(Mul.getValue(1));
Opers.push_back(Mul.getValue(0));
Opers.push_back(getValue(FPI.getArgOperand(2)));
}
break;
}
}
// A few strict DAG nodes carry additional operands that are not
@@ -7047,32 +7093,8 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
}
}
SDVTList VTs = DAG.getVTList(ValueVTs);
SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
assert(Result.getNode()->getNumValues() == 2);
// Push node to the appropriate list so that future instructions can be
// chained up correctly.
SDValue OutChain = Result.getValue(1);
switch (FPI.getExceptionBehavior().getValue()) {
case fp::ExceptionBehavior::ebIgnore:
// The only reason why ebIgnore nodes still need to be chained is that
// they might depend on the current rounding mode, and therefore must
// not be moved across instruction that may change that mode.
LLVM_FALLTHROUGH;
case fp::ExceptionBehavior::ebMayTrap:
// These must not be moved across calls or instructions that may change
// floating-point exception masks.
PendingConstrainedFP.push_back(OutChain);
break;
case fp::ExceptionBehavior::ebStrict:
// These must not be moved across calls or instructions that may change
// floating-point exception masks or read floating-point exception flags.
// In addition, they cannot be optimized out even if unused.
PendingConstrainedFPStrict.push_back(OutChain);
break;
}
pushOutChain(Result, EB);
SDValue FPResult = Result.getValue(0);
setValue(&FPI, FPResult);
......
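
To summarize the new lowering path: fmuladd becomes a single STRICT_FMA only when fusion is permitted (AllowFPOpFusion is not FPOpFusion::Strict) and the target reports FMA as faster; otherwise it is split, with the fmul's output chain threaded into the fadd so exception-state ordering is preserved. A stand-alone toy model of that decision and operand order (illustrative strings, not SelectionDAG code):

#include <string>
#include <vector>

// Strict DAG nodes take (chain, operands...) and produce (value, out-chain);
// in the split case the fadd consumes the fmul's out-chain and value.
std::vector<std::string> lowerConstrainedFMulAdd(bool FusionAllowed,
                                                 bool FMAFaster) {
  if (FusionAllowed && FMAFaster)
    return {"res:ch1 = STRICT_FMA ch0, a, b, c"};
  return {"mul:ch1 = STRICT_FMUL ch0, a, b",
          "res:ch2 = STRICT_FADD ch1, mul, c"};
}
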
@@ -322,6 +322,128 @@ entry:
ret double %result
}
; Verify constrained fmul and fadd aren't fused.
define float @f11(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f11:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: mulss %xmm1, %xmm0
; NOFMA-NEXT: addss %xmm2, %xmm0
; NOFMA-NEXT: retq
;
; FMA-LABEL: f11:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; FMA-NEXT: vaddss %xmm2, %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f11:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulss %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vaddss %xmm2, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret float %4
}
; Verify constrained fmul and fadd aren't fused.
define double @f12(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f12:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: mulsd %xmm1, %xmm0
; NOFMA-NEXT: addsd %xmm2, %xmm0
; NOFMA-NEXT: retq
;
; FMA-LABEL: f12:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; FMA-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f12:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret double %4
}
; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f15() #0 {
; NOFMA-LABEL: f15:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; NOFMA-NEXT: movaps %xmm1, %xmm0
; NOFMA-NEXT: mulss %xmm1, %xmm0
; NOFMA-NEXT: addss %xmm1, %xmm0
; NOFMA-NEXT: retq
;
; FMA-LABEL: f15:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f15:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT: vfmaddss %xmm0, %xmm0, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%result = call float @llvm.experimental.constrained.fmuladd.f32(
float 3.5,
float 3.5,
float 3.5,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret float %result
}
; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f16() #0 {
; NOFMA-LABEL: f16:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NOFMA-NEXT: movapd %xmm1, %xmm0
; NOFMA-NEXT: mulsd %xmm1, %xmm0
; NOFMA-NEXT: addsd %xmm1, %xmm0
; NOFMA-NEXT: retq
;
; FMA-LABEL: f16:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f16:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT: vfmaddsd %xmm0, %xmm0, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%result = call double @llvm.experimental.constrained.fmuladd.f64(
double 42.1,
double 42.1,
double 42.1,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret double %result
}
; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f17() #0 {
@@ -954,7 +1076,13 @@ entry:
attributes #0 = { strictfp }
declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)