AMDGPU: Try to use private version of sincos if available (f44beecb) · Commits · llvm-doe / llvm-project

llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp

+21 −15

Original line number	Diff line number	Diff line
		@@ -1065,18 +1065,18 @@ AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
		B.SetCurrentDebugLocation(DL);
		}

		Value *P = Alloc;
		Type *PTy = Fsincos.getFunctionType()->getParamType(1);
		Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

		// The allocaInst allocates the memory in private address space. This need
		// to be bitcasted to point to the address space of cos pointer type.
		// to be addrspacecasted to point to the address space of cos pointer type.
		// In OpenCL 2.0 this is generic, while in 1.2 that is private.
		if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
		P = B.CreateAddrSpaceCast(Alloc, PTy);
		Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

		CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, P);
		CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

		// TODO: Is it worth trying to preserve the location for the cos calls for the
		// load?

		LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
		return {SinCos, LoadCos, SinCos};
		}
		@@ -1100,15 +1100,19 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
		Function *F = B.GetInsertBlock()->getParent();
		Module *M = F->getParent();

		// Merge the sin and cos.
		// Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
		// implementation. Prefer the private form if available.
		AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
		SinCosLibFuncPrivate.getLeads()[0].PtrKind =
		AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

		// for OpenCL 2.0 we have only generic implementation of sincos
		// function.
		// FIXME: This is not true anymore
		AMDGPULibFunc SinCosLibFunc(AMDGPULibFunc::EI_SINCOS, fInfo);
		SinCosLibFunc.getLeads()[0].PtrKind =
		AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
		SinCosLibFuncGeneric.getLeads()[0].PtrKind =
		AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
		FunctionCallee FSinCos = getFunction(M, SinCosLibFunc);

		FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
		FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
		FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
		if (!FSinCos)
		return false;

		@@ -1121,7 +1125,8 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,

		StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
		StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
		const std::string SinCosName = SinCosLibFunc.mangle();
		const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
		const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

		// Intersect the two sets of flags.
		FastMathFlags FMF = FPOp->getFastMathFlags();
		@@ -1144,7 +1149,8 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
		SinCalls.push_back(XI);
		else if (UCallee->getName() == CosName)
		CosCalls.push_back(XI);
		else if (UCallee->getName() == SinCosName)
		else if (UCallee->getName() == SinCosPrivateName ||
		UCallee->getName() == SinCosGenericName)
		SinCosCalls.push_back(XI);
		else
		Handled = false;

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll

+12 −15

Original line number	Diff line number	Diff line
		@@ -106,12 +106,11 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
		; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
		; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: ret void
		;
		entry:
		@@ -127,12 +126,11 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ
		; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float 4.200000e+01, ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01)
		; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: ret void
		;
		entry:
		@@ -157,12 +155,11 @@ define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %
		; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
		; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
		; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
		; CHECK-NEXT: ret void
		;
		entry:

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll

+151 −189

File changed.

Preview size limit exceeded, changes collapsed.

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll

+16 −20

Original line number	Diff line number	Diff line
		@@ -14,12 +14,11 @@ define void @sincos_f32_nobuiltin(float noundef %x, ptr addrspace(1) nocapture n
		; CHECK-SAME: (float noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR0:[0-9]+]] {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]])
		; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: ret void
		;
		entry:
		@@ -36,12 +35,11 @@ define void @sincos_v2f32_nobuiltin(<2 x float> noundef %x, ptr addrspace(1) noc
		; CHECK-SAME: (<2 x float> noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR0]] {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
		; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]])
		; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
		; CHECK-NEXT: ret void
		;
		entry:
		@@ -57,12 +55,11 @@ define void @sincos_f32_no_builtins(float noundef %x, ptr addrspace(1) nocapture
		; CHECK-SAME: (float noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR1:[0-9]+]] {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
		; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]])
		; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
		; CHECK-NEXT: ret void
		;
		entry:
		@@ -78,12 +75,11 @@ define void @sincos_v2f32_no_builtins(<2 x float> noundef %x, ptr addrspace(1) n
		; CHECK-SAME: (<2 x float> noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR1]] {
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
		; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
		; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
		; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
		; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
		; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
		; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]])
		; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
		; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
		; CHECK-NEXT: ret void
		;
		entry:

llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll

+6 −6

Original line number	Diff line number	Diff line
		@@ -8,7 +8,7 @@
		; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
		; GCN-POSTLINK: call fast float @_Z3sinf(
		; GCN-POSTLINK: call fast float @_Z3cosf(
		; GCN-PRELINK: call fast float @_Z6sincosfPf(
		; GCN-PRELINK: call fast float @_Z6sincosfPU3AS5f(
		; GCN-NATIVE: call fast float @_Z10native_sinf(
		; GCN-NATIVE: call fast float @_Z10native_cosf(
		define amdgpu_kernel void @test_sincos(ptr addrspace(1) nocapture %a) {
		@@ -29,7 +29,7 @@ declare float @_Z3cosf(float)
		; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
		; GCN-POSTLINK: call fast <2 x float> @_Z3sinDv2_f(
		; GCN-POSTLINK: call fast <2 x float> @_Z3cosDv2_f(
		; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
		; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS5S_(
		; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f(
		; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f(
		define amdgpu_kernel void @test_sincos_v2(ptr addrspace(1) nocapture %a) {
		@@ -50,7 +50,7 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>)
		; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
		; GCN-POSTLINK: call fast <3 x float> @_Z3sinDv3_f(
		; GCN-POSTLINK: call fast <3 x float> @_Z3cosDv3_f(
		; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
		; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS5S_(
		; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f(
		; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f(
		define amdgpu_kernel void @test_sincos_v3(ptr addrspace(1) nocapture %a) {
		@@ -74,7 +74,7 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>)
		; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
		; GCN-POSTLINK: call fast <4 x float> @_Z3sinDv4_f(
		; GCN-POSTLINK: call fast <4 x float> @_Z3cosDv4_f(
		; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
		; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS5S_(
		; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f(
		; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f(
		define amdgpu_kernel void @test_sincos_v4(ptr addrspace(1) nocapture %a) {
		@@ -95,7 +95,7 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>)
		; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
		; GCN-POSTLINK: call fast <8 x float> @_Z3sinDv8_f(
		; GCN-POSTLINK: call fast <8 x float> @_Z3cosDv8_f(
		; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
		; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS5S_(
		; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f(
		; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f(
		define amdgpu_kernel void @test_sincos_v8(ptr addrspace(1) nocapture %a) {
		@@ -116,7 +116,7 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>)
		; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
		; GCN-POSTLINK: call fast <16 x float> @_Z3sinDv16_f(
		; GCN-POSTLINK: call fast <16 x float> @_Z3cosDv16_f(
		; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
		; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS5S_(
		; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f(
		; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f(
		define amdgpu_kernel void @test_sincos_v16(ptr addrspace(1) nocapture %a) {

Admin message