Commit f44beecb authored by Matt Arsenault
Browse files

AMDGPU: Try to use private version of sincos if available

The comment was out of date; the device libs build does provide all
the pointer overloads. An extremely pedantic interpretation of the
spec would suggest that only the flat version exists, but the
overloads do exist in the implementation.

https://reviews.llvm.org/D156720
parent 9f369a4c
Loading
Loading
Loading
Loading
+21 −15
Original line number Diff line number Diff line
@@ -1065,18 +1065,18 @@ AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
    B.SetCurrentDebugLocation(DL);
  }

  Value *P = Alloc;
  Type *PTy = Fsincos.getFunctionType()->getParamType(1);
  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  // The allocaInst allocates the memory in private address space. This need
  // to be bitcasted to point to the address space of cos pointer type.
  // to be addrspacecasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
  if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
    P = B.CreateAddrSpaceCast(Alloc, PTy);
  Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, P);
  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for the
  // load?

  LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
  return {SinCos, LoadCos, SinCos};
}
@@ -1100,15 +1100,19 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos.
  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  // for OpenCL 2.0 we have only generic implementation of sincos
  // function.
  // FIXME: This is not true anymore
  AMDGPULibFunc SinCosLibFunc(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFunc.getLeads()[0].PtrKind =
  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
  FunctionCallee FSinCos = getFunction(M, SinCosLibFunc);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

@@ -1121,7 +1125,8 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosName = SinCosLibFunc.mangle();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
@@ -1144,7 +1149,8 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosName)
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;
+12 −15
Original line number Diff line number Diff line
@@ -106,12 +106,11 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
@@ -127,12 +126,11 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ
; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float 4.200000e+01, ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01)
; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
@@ -157,12 +155,11 @@ define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %
; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
; CHECK-NEXT:    ret void
;
entry:
+151 −189

File changed.

Preview size limit exceeded, changes collapsed.

+16 −20
Original line number Diff line number Diff line
@@ -14,12 +14,11 @@ define void @sincos_f32_nobuiltin(float noundef %x, ptr addrspace(1) nocapture n
; CHECK-SAME: (float noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]])
; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
@@ -36,12 +35,11 @@ define void @sincos_v2f32_nobuiltin(<2 x float> noundef %x, ptr addrspace(1) noc
; CHECK-SAME: (<2 x float> noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]])
; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
; CHECK-NEXT:    ret void
;
entry:
@@ -57,12 +55,11 @@ define void @sincos_f32_no_builtins(float noundef %x, ptr addrspace(1) nocapture
; CHECK-SAME: (float noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]])
; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
@@ -78,12 +75,11 @@ define void @sincos_v2f32_no_builtins(<2 x float> noundef %x, ptr addrspace(1) n
; CHECK-SAME: (<2 x float> noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR1]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]])
; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
; CHECK-NEXT:    ret void
;
entry:
+6 −6
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
; GCN-POSTLINK: call fast float @_Z3sinf(
; GCN-POSTLINK: call fast float @_Z3cosf(
; GCN-PRELINK: call fast float @_Z6sincosfPf(
; GCN-PRELINK: call fast float @_Z6sincosfPU3AS5f(
; GCN-NATIVE: call fast float @_Z10native_sinf(
; GCN-NATIVE: call fast float @_Z10native_cosf(
define amdgpu_kernel void @test_sincos(ptr addrspace(1) nocapture %a) {
@@ -29,7 +29,7 @@ declare float @_Z3cosf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
; GCN-POSTLINK: call fast <2 x float> @_Z3sinDv2_f(
; GCN-POSTLINK: call fast <2 x float> @_Z3cosDv2_f(
; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS5S_(
; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f(
; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f(
define amdgpu_kernel void @test_sincos_v2(ptr addrspace(1) nocapture %a) {
@@ -50,7 +50,7 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
; GCN-POSTLINK: call fast <3 x float> @_Z3sinDv3_f(
; GCN-POSTLINK: call fast <3 x float> @_Z3cosDv3_f(
; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS5S_(
; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f(
; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f(
define amdgpu_kernel void @test_sincos_v3(ptr addrspace(1) nocapture %a) {
@@ -74,7 +74,7 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
; GCN-POSTLINK: call fast <4 x float> @_Z3sinDv4_f(
; GCN-POSTLINK: call fast <4 x float> @_Z3cosDv4_f(
; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS5S_(
; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f(
; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f(
define amdgpu_kernel void @test_sincos_v4(ptr addrspace(1) nocapture %a) {
@@ -95,7 +95,7 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
; GCN-POSTLINK: call fast <8 x float> @_Z3sinDv8_f(
; GCN-POSTLINK: call fast <8 x float> @_Z3cosDv8_f(
; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS5S_(
; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f(
; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f(
define amdgpu_kernel void @test_sincos_v8(ptr addrspace(1) nocapture %a) {
@@ -116,7 +116,7 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
; GCN-POSTLINK: call fast <16 x float> @_Z3sinDv16_f(
; GCN-POSTLINK: call fast <16 x float> @_Z3cosDv16_f(
; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS5S_(
; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f(
; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f(
define amdgpu_kernel void @test_sincos_v16(ptr addrspace(1) nocapture %a) {