Commit edd67564 authored by Sebastian Neubauer
Browse files

[AMDGPU] Emit stack frame size in metadata

Add .shader_functions to the PAL metadata; it contains the stack frame
size of every non-entry-point function.

Differential Revision: https://reviews.llvm.org/D90036
parent e73d8c79
Loading
Loading
Loading
Loading
+12 −3
Original line number Diff line number Diff line
@@ -456,9 +456,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS() && MFI->isEntryFunction())
  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS()) {
    else
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

@@ -1260,6 +1263,12 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
    MD->setWave32(MF.getFunction().getCallingConv());
}

void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MD->setStackFrameSize(MF, MFI.getStackSize());
}

// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
+1 −0
Original line number Diff line number Diff line
@@ -78,6 +78,7 @@ private:
                         const SIProgramInfo &KernelInfo);
  void EmitPALMetadata(const MachineFunction &MF,
                       const SIProgramInfo &KernelInfo);
  void emitPALFunctionMetadata(const MachineFunction &MF);
  void emitCommonFunctionComments(uint32_t NumVGPR,
                                  Optional<uint32_t> NumAGPR,
                                  uint32_t TotalNumVGPR,
+26 −0
Original line number Diff line number Diff line
@@ -238,6 +238,14 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
  getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
}

// Set the stack frame size of a function in the metadata.
//
// A fresh map node holding ".stack_frame_size_in_bytes" = Val is created and
// stored in the shader functions map under the name of MF's function.
void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
                                          unsigned Val) {
  auto FunctionNode = MsgPackDoc.getMapNode();
  FunctionNode[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
  // The assignment overwrites whatever node was stored under this name.
  getShaderFunctions()[MF.getFunction().getName()] = FunctionNode;
}

// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -721,6 +729,24 @@ msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
  return Registers.getMap();
}

// Return a reference to the ".shader_functions" node, i.e. the node found at
// root["amdpal.pipelines"][0][".shader_functions"] in the MsgPack document.
// Every step of the path is requested with Convert=true, and the resulting
// node itself is converted to a map before it is returned.
msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
  auto &RootMap = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
  auto &Pipelines = RootMap[MsgPackDoc.getNode("amdpal.pipelines")]
                        .getArray(/*Convert=*/true);
  auto &FirstPipeline = Pipelines[0].getMap(/*Convert=*/true);
  auto &Functions = FirstPipeline[MsgPackDoc.getNode(".shader_functions")];
  // Called only for its conversion effect; the plain DocNode is what callers
  // receive.
  Functions.getMap(/*Convert=*/true);
  return Functions;
}

// Return the shader functions map. The node is looked up through
// refShaderFunctions() only while the cached ShaderFunctions member is still
// empty; later calls reuse the cached node.
msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
  const bool IsCached = !ShaderFunctions.isEmpty();
  if (!IsCached)
    ShaderFunctions = refShaderFunctions();
  return ShaderFunctions.getMap();
}

// Return the PAL metadata hardware shader stage name.
static const char *getStageName(CallingConv::ID CC) {
  switch (CC) {
+11 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H

#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/CodeGen/MachineFunction.h"

namespace llvm {

@@ -26,6 +27,7 @@ class AMDGPUPALMetadata {
  msgpack::Document MsgPackDoc;
  msgpack::DocNode Registers;
  msgpack::DocNode HwStages;
  msgpack::DocNode ShaderFunctions;

public:
  // Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -76,6 +78,9 @@ public:
  // Set the scratch size in the metadata.
  void setScratchSize(unsigned CC, unsigned Val);

  // Set the stack frame size of a function in the metadata.
  void setStackFrameSize(const MachineFunction &MF, unsigned Val);

  // Set the hardware register bit in PAL metadata to enable wave32 on the
  // shader of the given calling convention.
  void setWave32(unsigned CC);
@@ -119,6 +124,12 @@ private:
  // Get (create if necessary) the registers map.
  msgpack::MapDocNode getRegisters();

  // Reference (create if necessary) the node for the shader functions map.
  msgpack::DocNode &refShaderFunctions();

  // Get (create if necessary) the shader functions map.
  msgpack::MapDocNode getShaderFunctions();

  // Get (create if necessary) the .hardware_stages entry for the given calling
  // convention.
  msgpack::MapDocNode getHwStage(unsigned CC);
+159 −14
Original line number Diff line number Diff line
; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s

; GCN-LABEL: {{^}}gfx_callable_amdpal:
; GCN:         .amdgpu_pal_metadata
; GCN-NEXT: ---
; GCN-NEXT: amdpal.pipelines:
; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL -enable-var-scope %s

declare float @extern_func(float) #0
declare float @extern_func_many_args(<64 x float>) #0

@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4

; Baseline case: no alloca and no call. The checks at the end of this file
; expect a stack frame size of 0 for this function.
define amdgpu_gfx float @no_stack(float %arg0) #0 {
  %add = fadd float %arg0, 1.0
  ret float %add
}

; One alloca of four floats in addrspace(5), accessed with a volatile store
; and load. The checks at the end of this file expect a frame size of 0x14.
define amdgpu_gfx float @simple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  ret float %add
}

; Two separate allocas of four floats each, both in the single basic block.
; The checks at the end of this file expect a frame size of 0x24.
define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  ; Second stack object, allocated after the first one has been used.
  %stack2 = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack2
  %val2 = load volatile float, float addrspace(5)* %stack2
  %add2 = fadd float %add, %val2
  ret float %add2
}

; The alloca lives in bb1, which is only reached when %arg0 > 0.0, i.e. it is
; not in the entry block. The checks at the end of this file expect a frame
; size of 0x10.
define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
  %cmp = fcmp ogt float %arg0, 0.0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  br label %bb2

bb2:
  ; 0.0 when bb1 was skipped, otherwise the sum computed in bb1.
  %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
  ret float %res
}

; The alloca sits inside bb1, a block that branches back to itself, so it can
; execute more than once per call. The checks at the end of this file expect a
; frame size of 0x10.
define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
  br label %bb1

bb1:
  %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  ; Loop again only while the counter is greater than zero.
  %cmp = icmp sgt i32 %ctr, 0
  %newctr = sub i32 %ctr, 1
  br i1 %cmp, label %bb1, label %bb2

bb2:
  ret float %add
}

; No alloca; calls @simple_stack, which is defined in this file. The checks at
; the end of this file expect a frame size of 0 for this function.
define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  ret float %res
}

; One alloca of four floats plus a call to @simple_stack, which is defined in
; this file. The checks at the end of this file expect a frame size of 0x20.
define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

; No alloca; calls @extern_func, which is only declared in this file. The
; checks at the end of this file expect a frame size of 0x10.
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  ret float %res
}

; One alloca of four floats plus a call to @extern_func, which is only
; declared in this file. The checks at the end of this file expect a frame
; size of 0x20.
define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

; No alloca; forwards a <64 x float> argument to the declared-only
; @extern_func_many_args. This is the one function whose expected frame size
; depends on the instruction selector: 0x90 for the runs without -global-isel
; and 0xd0 for the -global-isel run.
define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
  ret float %res
}

; No alloca; loads a function pointer from @funcptr and calls through it. The
; checks at the end of this file expect a frame size of 0x10.
define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
  %fptr = load void()*, void()* addrspace(4)* @funcptr
  call amdgpu_gfx void %fptr()
  ret float %arg0
}

; One alloca of four floats plus an indirect call through the pointer loaded
; from @funcptr. The checks at the end of this file expect a frame size of
; 0x20.
define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %fptr = load void()*, void()* addrspace(4)* @funcptr
  call amdgpu_gfx void %fptr()
  %add = fadd float %arg0, %val
  ret float %add
}

; One alloca of four floats plus a direct call to the function itself. The
; checks at the end of this file expect a frame size of 0x20.
define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

attributes #0 = { nounwind }

; GCN: amdpal.pipelines:
; GCN-NEXT:   - .registers:      {}
; GCN-NEXT:    .shader_functions:
; GCN-NEXT:      dynamic_stack:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:      dynamic_stack_loop:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:      multiple_stack:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
; GCN-NEXT:      no_stack:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT:      no_stack_call:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT:      no_stack_extern_call:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:      no_stack_extern_call_many_args:
; SDAG-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
; GISEL-NEXT:        .stack_frame_size_in_bytes: 0xd0{{$}}
; GCN-NEXT:      no_stack_indirect_call:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:      simple_stack:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
; GCN-NEXT:      simple_stack_call:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:      simple_stack_extern_call:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:      simple_stack_indirect_call:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:      simple_stack_recurse:
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: ...
; GCN-NEXT:         .end_amdgpu_pal_metadata
; Minimal amdgpu_gfx function: adds 1.0 to its half argument and returns the
; result.
define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) {
  %add = fadd half %arg0, 1.0
  ret half %add
}