Commit c089f028 authored by Reid Kleckner's avatar Reid Kleckner
Browse files

[X86] Don't setup and teardown memory for a musttail call

Summary:
musttail calls should not require allocating extra stack for arguments.
Updates to arguments passed in memory should happen in place before the
epilogue.

This bug was mostly a missed optimization, unless inalloca was used and
the store-to-push conversion fired.

If a reserved call frame was used for an inalloca musttail call, the
call setup and teardown instructions would be deleted, and SP
adjustments would be inserted in the prologue and epilogue. You can see
these are removed from several test cases in this change.

In the case where the stack frame was not reserved, i.e. call frame
optimization fires and turns argument stores into pushes, then the
imbalanced call frame setup instructions created for inalloca calls
become a problem. They remain in the instruction stream, resulting in a
call setup that allocates zero bytes (expected for inalloca) paired with
a call teardown that deallocates the entire inalloca pack. The resulting
SP adjustments were unbalanced, leading to subsequent crashes.

Reviewers: hans

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71097
parent a7bdab2e
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -3808,7 +3808,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                         "the only memory argument");
  }
  if (!IsSibcall)
  if (!IsSibcall && !IsMustTail)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
                                 NumBytes - NumBytesToPush, dl);
@@ -4093,7 +4093,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;
  if (!IsSibcall && isTailCall) {
  if (!IsSibcall && isTailCall && !IsMustTail) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+1 −3
Original line number Diff line number Diff line
@@ -171,8 +171,7 @@ entry:

  ; X64-LABEL: func_cf_tail
  ; X64:       leaq	target_func(%rip), %rax
  ; X64:       movq __guard_dispatch_icall_fptr(%rip), %rcx
  ; X64:       rex64 jmpq *%rcx            # TAILCALL
  ; X64:       rex64 jmpq *__guard_dispatch_icall_fptr(%rip)         # TAILCALL
  ; X64-NOT:   callq
}

@@ -205,7 +204,6 @@ entry:
  ; X64:            movq (%rcx), %rax
  ; X64-NEXT:       movq 8(%rax), %rax
  ; X64-NEXT:       movq __guard_dispatch_icall_fptr(%rip), %rdx
  ; X64-NEXT:       addq $40, %rsp
  ; X64-NEXT:       rex64 jmpq *%rdx            # TAILCALL
  ; X64-NOT:   callq
}
+38 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc %s -o - | FileCheck %s

; Previously, we would accidentally leave behind SP adjustments to setup a call
; frame for the musttail call target, and SP adjustments would end up
; unbalanced. Reported as https://crbug.com/1026882.

target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i386-pc-windows-msvc19.16.0"

; 20 bytes of memory.
%struct.Args = type { i32, i32, i32, i32, i32 }

declare dso_local x86_thiscallcc void @methodWithVtorDisp(i8* nocapture readonly, <{ %struct.Args }>* inalloca)

define dso_local x86_thiscallcc void @methodWithVtorDisp_thunk(i8* %0, <{ %struct.Args }>* inalloca %1) #0 {
; CHECK-LABEL: methodWithVtorDisp_thunk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl %ecx, %esi
; CHECK-NEXT:    subl -4(%ecx), %esi
; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
; CHECK-NEXT:    pushl $_methodWithVtorDisp_thunk
; CHECK-NEXT:    calll ___cyg_profile_func_exit
; CHECK-NEXT:    addl $8, %esp
; CHECK-NEXT:    movl %esi, %ecx
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    jmp _methodWithVtorDisp # TAILCALL
; Apply the vtordisp adjustment: load the 32-bit displacement stored at
; (this - 4) and subtract it from the incoming this pointer to recover
; the adjusted object pointer before forwarding.
  %3 = getelementptr inbounds i8, i8* %0, i32 -4
  %4 = bitcast i8* %3 to i32*
  %5 = load i32, i32* %4, align 4
  %6 = sub i32 0, %5
  %7 = getelementptr i8, i8* %0, i32 %6
; Forward the inalloca argument pack in place via musttail. The point of
; the test is that no new call frame (SP adjustment) is set up or torn
; down for this call — the CHECK lines above assert the epilogue goes
; straight to the tail jump with balanced stack adjustments.
  musttail call x86_thiscallcc void @methodWithVtorDisp(i8* %7, <{ %struct.Args }>* inalloca nonnull %1)
  ret void
}

attributes #0 = { nounwind optsize "instrument-function-exit-inlined"="__cyg_profile_func_exit"  }
+0 −22
Original line number Diff line number Diff line
@@ -9,8 +9,6 @@ declare tailcc i32 @tailcallee(i32 %a1, i32 %a2)
define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind {
; X64-LABEL: tailcaller:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rax
; X64-NEXT:    popq %rax
; X64-NEXT:    jmp tailcallee # TAILCALL
;
; X32-LABEL: tailcaller:
@@ -26,8 +24,6 @@ declare tailcc i8* @alias_callee()
define tailcc noalias i8* @noalias_caller() nounwind {
; X64-LABEL: noalias_caller:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    popq %rax
; X64-NEXT:    jmp alias_callee # TAILCALL
;
; X32-LABEL: noalias_caller:
@@ -42,8 +38,6 @@ declare tailcc noalias i8* @noalias_callee()
define tailcc i8* @alias_caller() nounwind {
; X64-LABEL: alias_caller:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    popq %rax
; X64-NEXT:    jmp noalias_callee # TAILCALL
;
; X32-LABEL: alias_caller:
@@ -56,25 +50,17 @@ define tailcc i8* @alias_caller() nounwind {
define tailcc void @void_test(i32, i32, i32, i32) {
; X64-LABEL: void_test:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rax
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    popq %rax
; X64-NEXT:    .cfi_def_cfa_offset 8
; X64-NEXT:    jmp void_test # TAILCALL
;
; X32-LABEL: void_test:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %esi
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    subl $8, %esp
; X32-NEXT:    .cfi_def_cfa_offset 16
; X32-NEXT:    .cfi_offset %esi, -8
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    addl $8, %esp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    popl %esi
; X32-NEXT:    .cfi_def_cfa_offset 4
; X32-NEXT:    jmp void_test # TAILCALL
@@ -86,25 +72,17 @@ define tailcc void @void_test(i32, i32, i32, i32) {
define tailcc i1 @i1test(i32, i32, i32, i32) {
; X64-LABEL: i1test:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rax
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    popq %rax
; X64-NEXT:    .cfi_def_cfa_offset 8
; X64-NEXT:    jmp i1test # TAILCALL
;
; X32-LABEL: i1test:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %esi
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    subl $8, %esp
; X32-NEXT:    .cfi_def_cfa_offset 16
; X32-NEXT:    .cfi_offset %esi, -8
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    addl $8, %esp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    popl %esi
; X32-NEXT:    .cfi_def_cfa_offset 4
; X32-NEXT:    jmp i1test # TAILCALL
+9 −49
Original line number Diff line number Diff line
@@ -246,12 +246,13 @@ define void @f_thunk(i8* %this, ...) {
; X86-NOSSE-NEXT:    movl %esp, %ebp
; X86-NOSSE-NEXT:    pushl %esi
; X86-NOSSE-NEXT:    andl $-16, %esp
; X86-NOSSE-NEXT:    subl $48, %esp
; X86-NOSSE-NEXT:    subl $32, %esp
; X86-NOSSE-NEXT:    movl 8(%ebp), %esi
; X86-NOSSE-NEXT:    leal 12(%ebp), %eax
; X86-NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT:    movl %esi, (%esp)
; X86-NOSSE-NEXT:    movl %eax, (%esp)
; X86-NOSSE-NEXT:    pushl %esi
; X86-NOSSE-NEXT:    calll _get_f
; X86-NOSSE-NEXT:    addl $4, %esp
; X86-NOSSE-NEXT:    movl %esi, 8(%ebp)
; X86-NOSSE-NEXT:    leal -4(%ebp), %esp
; X86-NOSSE-NEXT:    popl %esi
@@ -264,17 +265,18 @@ define void @f_thunk(i8* %this, ...) {
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $96, %esp
; X86-SSE-NEXT:    subl $80, %esp
; X86-SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
; X86-SSE-NEXT:    movl 8(%ebp), %esi
; X86-SSE-NEXT:    leal 12(%ebp), %eax
; X86-SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE-NEXT:    movl %esi, (%esp)
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    calll _get_f
; X86-SSE-NEXT:    addl $4, %esp
; X86-SSE-NEXT:    movl %esi, 8(%ebp)
; X86-SSE-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE-NEXT:    movaps (%esp), %xmm0 # 16-byte Reload
; X86-SSE-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-SSE-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
; X86-SSE-NEXT:    leal -4(%ebp), %esp
@@ -300,38 +302,21 @@ define void @f_thunk(i8* %this, ...) {
define void @g_thunk(i8* %fptr_i8, ...) {
; LINUX-LABEL: g_thunk:
; LINUX:       # %bb.0:
; LINUX-NEXT:    pushq %rax
; LINUX-NEXT:    .cfi_def_cfa_offset 16
; LINUX-NEXT:    popq %r11
; LINUX-NEXT:    .cfi_def_cfa_offset 8
; LINUX-NEXT:    jmpq *%rdi # TAILCALL
;
; LINUX-X32-LABEL: g_thunk:
; LINUX-X32:       # %bb.0:
; LINUX-X32-NEXT:    pushq %rax
; LINUX-X32-NEXT:    .cfi_def_cfa_offset 16
; LINUX-X32-NEXT:    movl %edi, %r11d
; LINUX-X32-NEXT:    addl $8, %esp
; LINUX-X32-NEXT:    .cfi_def_cfa_offset 8
; LINUX-X32-NEXT:    jmpq *%r11 # TAILCALL
;
; WINDOWS-LABEL: g_thunk:
; WINDOWS:       # %bb.0:
; WINDOWS-NEXT:    subq $40, %rsp
; WINDOWS-NEXT:    .seh_stackalloc 40
; WINDOWS-NEXT:    .seh_endprologue
; WINDOWS-NEXT:    addq $40, %rsp
; WINDOWS-NEXT:    rex64 jmpq *%rcx # TAILCALL
; WINDOWS-NEXT:    .seh_handlerdata
; WINDOWS-NEXT:    .text
; WINDOWS-NEXT:    .seh_endproc
;
; X86-LABEL: g_thunk:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    popl %ecx
; X86-NEXT:    jmpl *%eax # TAILCALL
  %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
  musttail call void (i8*, ...) %fptr(i8* %fptr_i8, ...)
@@ -347,78 +332,53 @@ define void @g_thunk(i8* %fptr_i8, ...) {
define void @h_thunk(%struct.Foo* %this, ...) {
; LINUX-LABEL: h_thunk:
; LINUX:       # %bb.0:
; LINUX-NEXT:    pushq %rax
; LINUX-NEXT:    .cfi_def_cfa_offset 16
; LINUX-NEXT:    cmpb $1, (%rdi)
; LINUX-NEXT:    jne .LBB2_2
; LINUX-NEXT:  # %bb.1: # %then
; LINUX-NEXT:    movq 8(%rdi), %r11
; LINUX-NEXT:    addq $8, %rsp
; LINUX-NEXT:    .cfi_def_cfa_offset 8
; LINUX-NEXT:    jmpq *%r11 # TAILCALL
; LINUX-NEXT:  .LBB2_2: # %else
; LINUX-NEXT:    .cfi_def_cfa_offset 16
; LINUX-NEXT:    movq 16(%rdi), %r11
; LINUX-NEXT:    movl $42, {{.*}}(%rip)
; LINUX-NEXT:    addq $8, %rsp
; LINUX-NEXT:    .cfi_def_cfa_offset 8
; LINUX-NEXT:    jmpq *%r11 # TAILCALL
;
; LINUX-X32-LABEL: h_thunk:
; LINUX-X32:       # %bb.0:
; LINUX-X32-NEXT:    pushq %rax
; LINUX-X32-NEXT:    .cfi_def_cfa_offset 16
; LINUX-X32-NEXT:    cmpb $1, (%edi)
; LINUX-X32-NEXT:    jne .LBB2_2
; LINUX-X32-NEXT:  # %bb.1: # %then
; LINUX-X32-NEXT:    movl 4(%edi), %r11d
; LINUX-X32-NEXT:    addl $8, %esp
; LINUX-X32-NEXT:    .cfi_def_cfa_offset 8
; LINUX-X32-NEXT:    jmpq *%r11 # TAILCALL
; LINUX-X32-NEXT:  .LBB2_2: # %else
; LINUX-X32-NEXT:    .cfi_def_cfa_offset 16
; LINUX-X32-NEXT:    movl 8(%edi), %r11d
; LINUX-X32-NEXT:    movl $42, {{.*}}(%rip)
; LINUX-X32-NEXT:    addl $8, %esp
; LINUX-X32-NEXT:    .cfi_def_cfa_offset 8
; LINUX-X32-NEXT:    jmpq *%r11 # TAILCALL
;
; WINDOWS-LABEL: h_thunk:
; WINDOWS:       # %bb.0:
; WINDOWS-NEXT:    subq $40, %rsp
; WINDOWS-NEXT:    .seh_stackalloc 40
; WINDOWS-NEXT:    .seh_endprologue
; WINDOWS-NEXT:    cmpb $1, (%rcx)
; WINDOWS-NEXT:    jne .LBB2_2
; WINDOWS-NEXT:  # %bb.1: # %then
; WINDOWS-NEXT:    movq 8(%rcx), %rax
; WINDOWS-NEXT:    addq $40, %rsp
; WINDOWS-NEXT:    rex64 jmpq *%rax # TAILCALL
; WINDOWS-NEXT:  .LBB2_2: # %else
; WINDOWS-NEXT:    movq 16(%rcx), %rax
; WINDOWS-NEXT:    movl $42, {{.*}}(%rip)
; WINDOWS-NEXT:    addq $40, %rsp
; WINDOWS-NEXT:    rex64 jmpq *%rax # TAILCALL
; WINDOWS-NEXT:    .seh_handlerdata
; WINDOWS-NEXT:    .text
; WINDOWS-NEXT:    .seh_endproc
;
; X86-LABEL: h_thunk:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    cmpb $1, (%eax)
; X86-NEXT:    jne LBB2_2
; X86-NEXT:  # %bb.1: # %then
; X86-NEXT:    movl 4(%eax), %ecx
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    jmpl *%ecx # TAILCALL
; X86-NEXT:  LBB2_2: # %else
; X86-NEXT:    movl 8(%eax), %ecx
; X86-NEXT:    movl $42, _g
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    jmpl *%ecx # TAILCALL
  %cond_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 0
  %cond = load i1, i1* %cond_p