Commit 40cd26c7 authored by Reid Kleckner's avatar Reid Kleckner
Browse files

[Win64] Handle FP arguments more gracefully under -mno-sse

Pass small FP values in GPRs or stack memory according to the normal
convention. This is what gcc -mno-sse does on Win64.

I adjusted the conditions under which we emit an error to check if the
argument or return value would be passed in an XMM register when SSE is
disabled. This has a side effect of no longer emitting an error for FP
arguments marked 'inreg' when targeting x86 with SSE disabled. Our
calling convention logic was already assigning it to FP0/FP1, and then
we emitted this error. That seems unnecessary; we can ignore 'inreg' and
compile it without SSE.

Reviewers: jyknight, aemerson

Differential Revision: https://reviews.llvm.org/D70465
parent 65c8abb1
Loading
Loading
Loading
Loading
+14 −6
Original line number Diff line number Diff line
@@ -346,6 +346,10 @@ def RetCC_X86_Win64_C : CallingConv<[
  // The X86-Win64 calling convention always returns __m64 values in RAX.
  CCIfType<[x86mmx], CCBitConvertToType<i64>>,

  // With SSE disabled, GCC returns FP values in RAX on Win64.
  CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
  CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,

  // Otherwise, everything is the same as 'normal' X86-64 C CC.
  CCDelegateTo<RetCC_X86_64_C>
]>;
@@ -613,7 +617,6 @@ def CC_X86_Win64_C : CallingConv<[
  // 128 bit vectors are passed by pointer
  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,


  // 256 bit vectors are passed by pointer
  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,

@@ -626,6 +629,16 @@ def CC_X86_Win64_C : CallingConv<[
  // The first 4 MMX vector arguments are passed in GPRs.
  CCIfType<[x86mmx], CCBitConvertToType<i64>>,

  // If SSE was disabled, pass FP values of 64 bits or fewer (f32, f64) as
  // integers in GPRs or on the stack.
  CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
  CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,

  // The first 4 FP arguments are passed in XMM registers.
  CCIfType<[f32, f64],
           CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
                                   [RCX , RDX , R8  , R9  ]>>,

  // The first 4 integer arguments are passed in integer registers.
  CCIfType<[i8 ], CCAssignToRegWithShadow<[CL  , DL  , R8B , R9B ],
                                          [XMM0, XMM1, XMM2, XMM3]>>,
@@ -643,11 +656,6 @@ def CC_X86_Win64_C : CallingConv<[
  CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8  , R9  ],
                                          [XMM0, XMM1, XMM2, XMM3]>>,

  // The first 4 FP/Vector arguments are passed in XMM registers.
  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
           CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
                                   [RCX , RDX , R8  , R9  ]>>,

  // Integer/FP values get stored in stack slots that are 8 bytes in size and
  // 8-byte aligned if there are no more registers to hold them.
  CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>>
+17 −16
Original line number Diff line number Diff line
@@ -2693,18 +2693,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");
    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2 is
      // not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }
@@ -2999,7 +2997,6 @@ SDValue X86TargetLowering::LowerCallResult(
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
@@ -3018,16 +3015,17 @@ SDValue X86TargetLowering::LowerCallResult(
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }
    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (CopyVT == MVT::f64 &&
               (Is64Bit && !Subtarget.hasSSE2())) {
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
@@ -3074,6 +3072,9 @@ SDValue X86TargetLowering::LowerCallResult(
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }
    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);
    InVals.push_back(Val);
  }
+129 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-windows-msvc < %s -mattr=-sse | FileCheck %s
; RUN: llc -mtriple=x86_64-windows-gnu < %s -mattr=-sse | FileCheck %s

; Receives a double argument with SSE disabled. The expected code reads the
; value from RCX (an integer register) and stores it through %p, held in RDX.
define void @recv_double(double %v, double* %p) {
; CHECK-LABEL: recv_double:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rcx, (%rdx)
; CHECK-NEXT:    retq
  store double %v, double* %p
  ret void
}

; Receives a float argument with SSE disabled. The expected code reads the
; 32-bit value from ECX (an integer register) and stores it through %p, held
; in RDX.
define void @recv_float(float %v, float* %p) {
; CHECK-LABEL: recv_float:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %ecx, (%rdx)
; CHECK-NEXT:    retq
  store float %v, float* %p
  ret void
}

; Returns a double loaded from %p with SSE disabled. The expected code places
; the 64-bit value in RAX rather than in an XMM register.
define dso_local double @ret_double(double* %p) {
; CHECK-LABEL: ret_double:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movq (%rcx), %rax
; CHECK-NEXT:    retq
entry:
  %v = load double, double* %p
  ret double %v
}

; Returns a float loaded from %p with SSE disabled. The expected code places
; the 32-bit value in EAX rather than in an XMM register.
define dso_local float @ret_float(float* %p) {
; CHECK-LABEL: ret_float:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl (%rcx), %eax
; CHECK-NEXT:    retq
entry:
  %v = load float, float* %p
  ret float %v
}

declare void @take_double(double)
declare void @take_float(float)

; Passes a double to an external callee with SSE disabled. The expected code
; loads the value straight into RCX before the call, so the outgoing argument
; travels in an integer register.
define void @pass_double(double* %p) {
; CHECK-LABEL: pass_double:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    .seh_stackalloc 40
; CHECK-NEXT:    .seh_endprologue
; CHECK-NEXT:    movq (%rcx), %rcx
; CHECK-NEXT:    callq take_double
; CHECK-NEXT:    nop
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    retq
; CHECK-NEXT:    .seh_handlerdata
; CHECK-NEXT:    .text
; CHECK-NEXT:    .seh_endproc
  %v = load double, double* %p
  call void @take_double(double %v)
  ret void
}

; Passes a float to an external callee with SSE disabled. The expected code
; loads the 32-bit value straight into ECX before the call, so the outgoing
; argument travels in an integer register.
define void @pass_float(float* %p) {
; CHECK-LABEL: pass_float:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    .seh_stackalloc 40
; CHECK-NEXT:    .seh_endprologue
; CHECK-NEXT:    movl (%rcx), %ecx
; CHECK-NEXT:    callq take_float
; CHECK-NEXT:    nop
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    retq
; CHECK-NEXT:    .seh_handlerdata
; CHECK-NEXT:    .text
; CHECK-NEXT:    .seh_endproc
  %v = load float, float* %p
  call void @take_float(float %v)
  ret void
}

declare double @produce_double()
declare float @produce_float()

; Consumes a double returned by an external callee with SSE disabled. The
; expected code saves %p in RSI across the call and then stores the result
; from RAX, so the returned value is read from an integer register.
define void @call_double(double* %p) {
; CHECK-LABEL: call_double:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rsi
; CHECK-NEXT:    .seh_pushreg %rsi
; CHECK-NEXT:    subq $32, %rsp
; CHECK-NEXT:    .seh_stackalloc 32
; CHECK-NEXT:    .seh_endprologue
; CHECK-NEXT:    movq %rcx, %rsi
; CHECK-NEXT:    callq produce_double
; CHECK-NEXT:    movq %rax, (%rsi)
; CHECK-NEXT:    addq $32, %rsp
; CHECK-NEXT:    popq %rsi
; CHECK-NEXT:    retq
; CHECK-NEXT:    .seh_handlerdata
; CHECK-NEXT:    .text
; CHECK-NEXT:    .seh_endproc
  %v = call double @produce_double()
  store double %v, double* %p
  ret void
}

; Consumes a float returned by an external callee with SSE disabled. The
; expected code saves %p in RSI across the call and then stores the 32-bit
; result from EAX, so the returned value is read from an integer register.
define void @call_float(float* %p) {
; CHECK-LABEL: call_float:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rsi
; CHECK-NEXT:    .seh_pushreg %rsi
; CHECK-NEXT:    subq $32, %rsp
; CHECK-NEXT:    .seh_stackalloc 32
; CHECK-NEXT:    .seh_endprologue
; CHECK-NEXT:    movq %rcx, %rsi
; CHECK-NEXT:    callq produce_float
; CHECK-NEXT:    movl %eax, (%rsi)
; CHECK-NEXT:    addq $32, %rsp
; CHECK-NEXT:    popq %rsi
; CHECK-NEXT:    retq
; CHECK-NEXT:    .seh_handlerdata
; CHECK-NEXT:    .text
; CHECK-NEXT:    .seh_endproc
  %v = call float @produce_float()
  store float %v, float* %p
  ret void
}
+76 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=i686 -mattr=+sse | FileCheck %s
; RUN: llc < %s -mcpu=i686 -mattr=-sse 2>&1 | FileCheck --check-prefix NOSSE %s

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-unknown-linux-gnu"
@f = external global float
@d = external global double

; Calls four external functions whose float/double arguments and results are
; all marked 'inreg', round-tripping the globals @f and @d through them.
; With SSE enabled, the expected code moves the float arguments through xmm0
; onto the stack. With SSE disabled, the expected code uses only x87
; loads/stores (flds/fstps, fldl/fstpl). In both configurations every result
; is stored with an x87 store (fstps/fstpl).
; The SSE-disabled run line invokes llc without 'not', so compilation is
; expected to succeed rather than report an error.
define void @test() nounwind {
; CHECK-LABEL: test:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    subl $12, %esp
; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movss %xmm0, (%esp)
; CHECK-NEXT:    calll foo1
; CHECK-NEXT:    fstps f
; CHECK-NEXT:    fldl d
; CHECK-NEXT:    fstpl (%esp)
; CHECK-NEXT:    calll foo2
; CHECK-NEXT:    fstpl d
; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movss %xmm0, (%esp)
; CHECK-NEXT:    calll foo3
; CHECK-NEXT:    fstps f
; CHECK-NEXT:    fldl d
; CHECK-NEXT:    fstpl (%esp)
; CHECK-NEXT:    calll foo4
; CHECK-NEXT:    fstpl d
; CHECK-NEXT:    addl $12, %esp
; CHECK-NEXT:    retl
;
; NOSSE-LABEL: test:
; NOSSE:       # %bb.0: # %entry
; NOSSE-NEXT:    subl $12, %esp
; NOSSE-NEXT:    flds f
; NOSSE-NEXT:    fstps (%esp)
; NOSSE-NEXT:    calll foo1
; NOSSE-NEXT:    fstps f
; NOSSE-NEXT:    fldl d
; NOSSE-NEXT:    fstpl (%esp)
; NOSSE-NEXT:    calll foo2
; NOSSE-NEXT:    fstpl d
; NOSSE-NEXT:    flds f
; NOSSE-NEXT:    fstps (%esp)
; NOSSE-NEXT:    calll foo3
; NOSSE-NEXT:    fstps f
; NOSSE-NEXT:    fldl d
; NOSSE-NEXT:    fstpl (%esp)
; NOSSE-NEXT:    calll foo4
; NOSSE-NEXT:    fstpl d
; NOSSE-NEXT:    addl $12, %esp
; NOSSE-NEXT:    retl
entry:
  %0 = load float, float* @f, align 4
  %1 = tail call inreg float @foo1(float inreg %0) nounwind
  store float %1, float* @f, align 4
  %2 = load double, double* @d, align 8
  %3 = tail call inreg double @foo2(double inreg %2) nounwind
  store double %3, double* @d, align 8
  %4 = load float, float* @f, align 4
  %5 = tail call inreg float @foo3(float inreg %4) nounwind
  store float %5, float* @f, align 4
  %6 = load double, double* @d, align 8
  %7 = tail call inreg double @foo4(double inreg %6) nounwind
  store double %7, double* @d, align 8
  ret void
}

declare inreg float @foo1(float inreg)

declare inreg double @foo2(double inreg)

declare inreg float @foo3(float inreg)

declare inreg double @foo4(double inreg)
+0 −36
Original line number Diff line number Diff line
; RUN: not llc < %s -mcpu=i686 -mattr=-sse 2>&1 | FileCheck --check-prefix NOSSE %s
; RUN: llc < %s -mcpu=i686 -mattr=+sse | FileCheck %s

; NOSSE: {{SSE register return with SSE disabled}}

; CHECK: xmm

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-unknown-linux-gnu"
@f = external global float		; <float*> [#uses=4]
@d = external global double		; <double*> [#uses=4]

define void @test() nounwind {
entry:
	%0 = load float, float* @f, align 4		; <float> [#uses=1]
	%1 = tail call inreg float @foo1(float inreg %0) nounwind		; <float> [#uses=1]
	store float %1, float* @f, align 4
	%2 = load double, double* @d, align 8		; <double> [#uses=1]
	%3 = tail call inreg double @foo2(double inreg %2) nounwind		; <double> [#uses=1]
	store double %3, double* @d, align 8
	%4 = load float, float* @f, align 4		; <float> [#uses=1]
	%5 = tail call inreg float @foo3(float inreg %4) nounwind		; <float> [#uses=1]
	store float %5, float* @f, align 4
	%6 = load double, double* @d, align 8		; <double> [#uses=1]
	%7 = tail call inreg double @foo4(double inreg %6) nounwind		; <double> [#uses=1]
	store double %7, double* @d, align 8
	ret void
}

declare inreg float @foo1(float inreg)

declare inreg double @foo2(double inreg)

declare inreg float @foo3(float inreg)

declare inreg double @foo4(double inreg)