[AMDGPU] Emit stack frame size in metadata (edd67564) · Commits · llvm-doe / llvm-project

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

+12 −3

Original line number	Diff line number	Diff line
		@@ -456,9 +456,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
		Info = analyzeResourceUsage(MF);
		}

		if (STM.isAmdPalOS() && MFI->isEntryFunction())
		if (STM.isAmdPalOS()) {
		if (MFI->isEntryFunction())
		EmitPALMetadata(MF, CurrentProgramInfo);
		else if (!STM.isAmdHsaOS()) {
		else
		emitPALFunctionMetadata(MF);
		} else if (!STM.isAmdHsaOS()) {
		EmitProgramInfoSI(MF, CurrentProgramInfo);
		}

		@@ -1260,6 +1263,12 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
		MD->setWave32(MF.getFunction().getCallingConv());
		}

		void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
		auto *MD = getTargetStreamer()->getPALMetadata();
		const MachineFrameInfo &MFI = MF.getFrameInfo();
		MD->setStackFrameSize(MF, MFI.getStackSize());
		}

		// This is supposed to be log2(Size)
		static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
		switch (Size) {

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -78,6 +78,7 @@ private:
		const SIProgramInfo &KernelInfo);
		void EmitPALMetadata(const MachineFunction &MF,
		const SIProgramInfo &KernelInfo);
		void emitPALFunctionMetadata(const MachineFunction &MF);
		void emitCommonFunctionComments(uint32_t NumVGPR,
		Optional<uint32_t> NumAGPR,
		uint32_t TotalNumVGPR,

llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp

+26 −0

Original line number	Diff line number	Diff line
		@@ -238,6 +238,14 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
		getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
		}

		// Set the stack frame size of a function in the metadata. The value is stored
		// under ".stack_frame_size_in_bytes" in a fresh map node, which is then placed
		// in the shader functions map keyed by the function's name.
		void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
		                                          unsigned Val) {
		  auto FuncInfo = MsgPackDoc.getMapNode();
		  FuncInfo[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
		  getShaderFunctions()[MF.getFunction().getName()] = FuncInfo;
		}

		// Set the hardware register bit in PAL metadata to enable wave32 on the
		// shader of the given calling convention.
		void AMDGPUPALMetadata::setWave32(unsigned CC) {
		@@ -721,6 +729,24 @@ msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
		return Registers.getMap();
		}

		// Reference (create if necessary) the node for the shader functions map.
		// The node is reached from the document root via "amdpal.pipelines", then
		// element 0 of that array, then ".shader_functions".
		msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
		  auto &N =
		      MsgPackDoc.getRoot()
		          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
		          .getArray(/*Convert=*/true)[0]
		          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
		  // Turn the referenced node itself into a map before returning it.
		  N.getMap(/*Convert=*/true);
		  return N;
		}

		// Get (create if necessary) the shader functions map. refShaderFunctions() is
		// consulted only while the ShaderFunctions member is still empty; afterwards
		// the stored node is reused.
		msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
		  if (!ShaderFunctions.isEmpty())
		    return ShaderFunctions.getMap();
		  ShaderFunctions = refShaderFunctions();
		  return ShaderFunctions.getMap();
		}

		// Return the PAL metadata hardware shader stage name.
		static const char *getStageName(CallingConv::ID CC) {
		switch (CC) {

llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h

+11 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H

		#include "llvm/BinaryFormat/MsgPackDocument.h"
		#include "llvm/CodeGen/MachineFunction.h"

		namespace llvm {

		@@ -26,6 +27,7 @@ class AMDGPUPALMetadata {
		msgpack::Document MsgPackDoc;
		msgpack::DocNode Registers;
		msgpack::DocNode HwStages;
		msgpack::DocNode ShaderFunctions;

		public:
		// Read the amdgpu.pal.metadata supplied by the frontend, ready for
		@@ -76,6 +78,9 @@ public:
		// Set the scratch size in the metadata.
		void setScratchSize(unsigned CC, unsigned Val);

		// Set the stack frame size of a function in the metadata.
		void setStackFrameSize(const MachineFunction &MF, unsigned Val);

		// Set the hardware register bit in PAL metadata to enable wave32 on the
		// shader of the given calling convention.
		void setWave32(unsigned CC);
		@@ -119,6 +124,12 @@ private:
		// Get (create if necessary) the registers map.
		msgpack::MapDocNode getRegisters();

		// Reference (create if necessary) the node for the shader functions map.
		msgpack::DocNode &refShaderFunctions();

		// Get (create if necessary) the shader functions map.
		msgpack::MapDocNode getShaderFunctions();

		// Get (create if necessary) the .hardware_stages entry for the given calling
		// convention.
		msgpack::MapDocNode getHwStage(unsigned CC);

llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

+159 −14

Original line number	Diff line number	Diff line
		; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
		; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
		; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
		; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s

		; GCN-LABEL: {{^}}gfx_callable_amdpal:
		; GCN: .amdgpu_pal_metadata
		; GCN-NEXT: ---
		; GCN-NEXT: amdpal.pipelines:
		; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
		; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
		; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
		; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL -enable-var-scope %s

		declare float @extern_func(float) #0
		declare float @extern_func_many_args(<64 x float>) #0

		@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4

		; Function with no alloca and no calls.
		define amdgpu_gfx float @no_stack(float %arg0) #0 {
		  %sum = fadd float %arg0, 1.0
		  ret float %sum
		}

		; Function with a single 4 x float alloca in addrspace(5) and no calls.
		define amdgpu_gfx float @simple_stack(float %arg0) #0 {
		  %slot = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot
		  %loaded = load volatile float, float addrspace(5)* %slot
		  %sum = fadd float %arg0, %loaded
		  ret float %sum
		}

		; Function with two separate 4 x float allocas in addrspace(5) and no calls.
		define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
		  %slot.a = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot.a
		  %loaded.a = load volatile float, float addrspace(5)* %slot.a
		  %sum.a = fadd float %arg0, %loaded.a
		  %slot.b = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot.b
		  %loaded.b = load volatile float, float addrspace(5)* %slot.b
		  %sum.b = fadd float %sum.a, %loaded.b
		  ret float %sum.b
		}

		; Function whose only alloca sits in bb1, a block that is reached
		; conditionally rather than in the entry block.
		define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
		bb0:
		  %is.pos = fcmp ogt float %arg0, 0.0
		  br i1 %is.pos, label %bb1, label %bb2

		bb1:
		  %slot = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot
		  %loaded = load volatile float, float addrspace(5)* %slot
		  %sum = fadd float %arg0, %loaded
		  br label %bb2

		bb2:
		  %result = phi float [ 0.0, %bb0 ], [ %sum, %bb1 ]
		  ret float %result
		}

		; Function whose alloca sits inside bb1, a block that branches back to itself.
		define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
		bb0:
		  br label %bb1

		bb1:
		  %iter = phi i32 [ 0, %bb0 ], [ %iter.next, %bb1 ]
		  %slot = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot
		  %loaded = load volatile float, float addrspace(5)* %slot
		  %sum = fadd float %arg0, %loaded
		  %again = icmp sgt i32 %iter, 0
		  %iter.next = sub i32 %iter, 1
		  br i1 %again, label %bb1, label %bb2

		bb2:
		  ret float %sum
		}

		; Function with no alloca that directly calls @simple_stack, which is defined
		; in this file.
		define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
		  %callee.ret = call amdgpu_gfx float @simple_stack(float %arg0)
		  ret float %callee.ret
		}

		; Function with one 4 x float alloca that also directly calls @simple_stack.
		define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
		  %slot = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot
		  %loaded = load volatile float, float addrspace(5)* %slot
		  %callee.ret = call amdgpu_gfx float @simple_stack(float %arg0)
		  %sum = fadd float %callee.ret, %loaded
		  ret float %sum
		}

		; Function with no alloca that calls @extern_func, which is only declared in
		; this file.
		define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
		  %callee.ret = call amdgpu_gfx float @extern_func(float %arg0)
		  ret float %callee.ret
		}

		; Function with one 4 x float alloca that also calls the declared-only
		; @extern_func.
		define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
		  %slot = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot
		  %loaded = load volatile float, float addrspace(5)* %slot
		  %callee.ret = call amdgpu_gfx float @extern_func(float %arg0)
		  %sum = fadd float %callee.ret, %loaded
		  ret float %sum
		}

		; Function with no alloca that forwards a <64 x float> argument to the
		; declared-only @extern_func_many_args.
		define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
		  %callee.ret = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
		  ret float %callee.ret
		}

		; Function with no alloca that calls through a function pointer loaded from
		; the external constant @funcptr.
		define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
		  %fptr = load void()*, void()* addrspace(4)* @funcptr
		  call amdgpu_gfx void %fptr()
		  ret float %arg0
		}

		; Function with one 4 x float alloca that also calls through a function
		; pointer loaded from the external constant @funcptr.
		define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
		  %stack = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %stack
		  %val = load volatile float, float addrspace(5)* %stack
		  %fptr = load void()*, void()* addrspace(4)* @funcptr
		  call amdgpu_gfx void %fptr()
		  %add = fadd float %arg0, %val
		  ret float %add
		}

		; Function with one 4 x float alloca that directly calls itself.
		define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
		  %slot = alloca float, i32 4, align 4, addrspace(5)
		  store volatile float 2.0, float addrspace(5)* %slot
		  %loaded = load volatile float, float addrspace(5)* %slot
		  %self.ret = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
		  %sum = fadd float %self.ret, %loaded
		  ret float %sum
		}

		attributes #0 = { nounwind }

		; GCN: amdpal.pipelines:
		; GCN-NEXT: - .registers: {}
		; GCN-NEXT: .shader_functions:
		; GCN-NEXT: dynamic_stack:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
		; GCN-NEXT: dynamic_stack_loop:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
		; GCN-NEXT: multiple_stack:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}}
		; GCN-NEXT: no_stack:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
		; GCN-NEXT: no_stack_call:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
		; GCN-NEXT: no_stack_extern_call:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
		; GCN-NEXT: no_stack_extern_call_many_args:
		; SDAG-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
		; GISEL-NEXT: .stack_frame_size_in_bytes: 0xd0{{$}}
		; GCN-NEXT: no_stack_indirect_call:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
		; GCN-NEXT: simple_stack:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}}
		; GCN-NEXT: simple_stack_call:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
		; GCN-NEXT: simple_stack_extern_call:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
		; GCN-NEXT: simple_stack_indirect_call:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
		; GCN-NEXT: simple_stack_recurse:
		; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
		; GCN-NEXT: ...
		; GCN-NEXT: .end_amdgpu_pal_metadata
		; Minimal amdgpu_gfx function on half: adds 1.0 to its argument and returns it.
		define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) {
		  %inc = fadd half %arg0, 1.0
		  ret half %inc
		}