Commit 90c31b0f authored by Craig Topper's avatar Craig Topper
Browse files

[X86] Custom lower ISD::FROUND with SSE4.1 to avoid a libcall.

ISD::FROUND is defined to round to nearest with ties rounding
away from 0. This mode isn't supported in hardware on X86.

But as long as we aren't compiling with trapping math, we can
emulate this with floor(X + copysign(nextafter(0.5, 0.0), X)).

We have to use nextafter to avoid some corner cases that adding
0.5 would have. For example, if X is nextafter(0.5, 0.0) it should
round to 0.0, but adding 0.5 would need one more bit of mantissa
than can be stored, so it rounds to 1.0. Adding nextafter(0.5, 0.0)
instead will just increase the exponent by 1 and leave the mantissa
as all 1s. This would be nextafter(1.0, 0.0) which will floor to 0.0.

Technically this requires -fno-trapping-math which isn't our default.
But if we care about exceptions we should be using constrained
intrinsics. Constrained intrinsics would use STRICT_FROUND which
won't go through this code.

Fixes PR42195.

Differential Revision: https://reviews.llvm.org/D73607
parent 4bc07c33
Loading
Loading
Loading
Loading
+32 −0
Original line number Diff line number Diff line
@@ -1078,6 +1078,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
      setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
      setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
    }
    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
@@ -1170,6 +1172,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
      setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FROUND,            VT, Custom);
      setOperationAction(ISD::FNEG,              VT, Custom);
      setOperationAction(ISD::FABS,              VT, Custom);
      setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
@@ -1535,6 +1540,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
      setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FROUND,            VT, Custom);
      setOperationAction(ISD::SELECT,           VT, Custom);
    }
@@ -20450,6 +20457,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
  return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
/// ISD::FROUND rounds to nearest with ties rounding away from zero. X86 has
/// no hardware rounding mode for that, but as long as we aren't compiling
/// with trapping math we can emulate it as
/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Src = Op.getOperand(0);

  // Build the magic addend: the largest value strictly less than 0.5 in the
  // type's semantics. Using exactly 0.5 would double-round for inputs like
  // nextafter(0.5, 0.0), which must round to 0.0 but would hit 1.0.
  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
  bool LosesInfo;
  APFloat HalfPred(0.5f);
  HalfPred.convert(Sem, APFloat::rmNearestTiesToEven, &LosesInfo);
  HalfPred.next(/*nextDown*/ true);

  // Src += copysign(nextafter(0.5, 0.0), Src)
  SDValue Magic = DAG.getConstantFP(HalfPred, DL, VT);
  SDValue SignedMagic = DAG.getNode(ISD::FCOPYSIGN, DL, VT, Magic, Src);
  SDValue Biased = DAG.getNode(ISD::FADD, DL, VT, Src, SignedMagic);

  // Strip the fraction; truncation toward zero is correct here because the
  // addend carried Src's sign.
  return DAG.getNode(ISD::FTRUNC, DL, VT, Biased);
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -28623,6 +28654,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
  case ISD::FADD:
  case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
  case ISD::FROUND:             return LowerFROUND(Op, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+34 −7
Original line number Diff line number Diff line
@@ -1067,13 +1067,25 @@ define double @nearbyint_v4f64(<4 x double> %x) nounwind {
define float @round_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: round_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    jmp roundf # TAILCALL
; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X64-NEXT:    vorps %xmm1, %xmm2, %xmm1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X86-NEXT:    vorps %xmm1, %xmm2, %xmm1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll roundf
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
@@ -1084,17 +1096,32 @@ define float @round_v4f32(<4 x float> %x) nounwind {
define double @round_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: round_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vandpd {{.*}}(%rip), %xmm0, %xmm1
; X64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X64-NEXT:    # xmm2 = mem[0,0]
; X64-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    jmp round # TAILCALL
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vandpd {{\.LCPI.*}}, %xmm0, %xmm1
; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X86-NEXT:    # xmm2 = mem[0,0]
; X86-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll round
; X86-NEXT:    addl $8, %esp
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
+4 −10
Original line number Diff line number Diff line
@@ -386,16 +386,10 @@ define <2 x float> @rint_v2f32(<2 x float> %x) nounwind {
define <2 x float> @round_v2f32(<2 x float> %x) nounwind {
; CHECK-LABEL: round_v2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT:    callq roundf
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
; CHECK-NEXT:    callq roundf
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm1
; CHECK-NEXT:    vorps {{.*}}(%rip), %xmm1, %xmm1
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vroundps $11, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %r = call <2 x float> @llvm.round.v2f32(<2 x float> %x)
  ret <2 x float> %r
+0 −30
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mcpu=nehalem -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

declare void @use(<2 x double>)

; Function Attrs: nounwind uwtable
; Regression test (removed in this commit's diff): lowering llvm.round.v2f64
; used to emit a libcall to round(); the CHECK lines below pin that codegen.
define void @test() {
; CHECK-LABEL: test:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    callq round
; CHECK-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    callq use
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
entry:
  ; Round an undef vector and hand the result to an opaque callee so the
  ; round lowering can't be folded away.
  %tmp = call <2 x double> @llvm.round.v2f64(<2 x double> undef)
  call void @use(<2 x double> %tmp)
  ret void
}

; Function Attrs: nounwind readonly
declare <2 x double> @llvm.round.v2f64(<2 x double>) #0

attributes #0 = { nounwind readonly }