Commit d77c6205 authored by Changpeng Fang's avatar Changpeng Fang
Browse files

[clang][AMDGPU]: Don't use byval for struct arguments in function ABI

Summary:
  Byval requires allocating additional stack space, and always requires an implicit copy to be inserted in codegen,
where it can be difficult to optimize. In this work, we use byref/IndirectAliased promotion method instead of
byval with the implicit copy semantics.

Reviewers:
  arsenm

Differential Revision:
  https://reviews.llvm.org/D155986
parent 9e3d9c9e
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -193,6 +193,10 @@ Target Specific Changes

AMDGPU Support
^^^^^^^^^^^^^^
- Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct
  arguments in C ABI. Callee is responsible for allocating stack memory and
  copying the value of the struct if modified. Note that AMDGPU backend still
  supports byval for struct arguments.

X86 Support
^^^^^^^^^^^
+8 −4
Original line number Diff line number Diff line
@@ -2156,7 +2156,8 @@ static bool DetermineNoUndef(QualType QTy, CodeGenTypes &Types,
                             const llvm::DataLayout &DL, const ABIArgInfo &AI,
                             bool CheckCoerce = true) {
  llvm::Type *Ty = Types.ConvertTypeForMem(QTy);
  if (AI.getKind() == ABIArgInfo::Indirect)
  if (AI.getKind() == ABIArgInfo::Indirect ||
      AI.getKind() == ABIArgInfo::IndirectAliased)
    return true;
  if (AI.getKind() == ABIArgInfo::Extend)
    return true;
@@ -5126,12 +5127,15 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
          auto LV = I->getKnownLValue();
          auto AS = LV.getAddressSpace();

          if (!ArgInfo.getIndirectByVal() ||
          bool isByValOrRef =
              ArgInfo.isIndirectAliased() || ArgInfo.getIndirectByVal();

          if (!isByValOrRef ||
              (LV.getAlignment() < getContext().getTypeAlignInChars(I->Ty))) {
            NeedCopy = true;
          }
          if (!getLangOpts().OpenCL) {
            if ((ArgInfo.getIndirectByVal() &&
            if ((isByValOrRef &&
                (AS != LangAS::Default &&
                 AS != CGM.getASTAllocaAddressSpace()))) {
              NeedCopy = true;
@@ -5139,7 +5143,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
          }
          // For OpenCL even if RV is located in default or alloca address space
          // we don't want to perform address space cast for it.
          else if ((ArgInfo.getIndirectByVal() &&
          else if ((isByValOrRef &&
                    Addr.getType()->getAddressSpace() != IRFuncTy->
                      getParamType(FirstIRArg)->getPointerAddressSpace())) {
            NeedCopy = true;
+6 −0
Original line number Diff line number Diff line
@@ -248,6 +248,12 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference in stead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
+4 −4
Original line number Diff line number Diff line
@@ -9,14 +9,14 @@ struct A {
  float *p;
};

// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}})
// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}})
// NVPTX: define{{.*}} void @_Z6kernel1A(ptr noundef byval(%struct.A) align 8 %x)
__global__ void kernel(A x) {
}

class Kernel {
public:
  // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}})
  // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}})
  // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(ptr noundef byval(%struct.A) align 8 %x)
  static __global__ void memberKernel(A x){}
  template<typename T> static __global__ void templateMemberKernel(T x) {}
@@ -30,11 +30,11 @@ void launch(void*);

void test() {
  Kernel K;
  // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}}
  // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}
  // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(ptr noundef byval(%struct.A) align 8 %x)
  launch((void*)templateKernel<A>);

  // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}}
  // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}
  // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr noundef byval(%struct.A) align 8 %x)
  launch((void*)Kernel::templateMemberKernel<A>);
}
+10 −9
Original line number Diff line number Diff line
@@ -19,14 +19,13 @@ void func_with_ref_arg(A &a);
void func_with_ref_arg(B &b);

// CHECK-LABEL: @_Z22func_with_indirect_arg1A(
// CHECK-SAME:  ptr addrspace(5) noundef [[ARG:%.*]])
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[P:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDIRECT_ADDR]] to ptr
// CHECK-NEXT:    [[A_INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_INDIRECT_ADDR]] to ptr
// CHECK-NEXT:    [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr
// CHECK-NEXT:    store ptr addrspace(5) [[ARG]], ptr [[INDIRECT_ADDR_ASCAST]]
// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A:%.*]] to ptr
// CHECK-NEXT:    store ptr addrspace(5) [[A:%.*]], ptr [[A_INDIRECT_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
// CHECK-NEXT:    store ptr [[A_ASCAST]], ptr [[P_ASCAST]], align 8
// CHECK-NEXT:    ret void
//
@@ -73,10 +72,12 @@ void test_indirect_arg_global() {

// CHECK-LABEL: @_Z19func_with_byval_arg1B(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[COERCE:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5)
// CHECK-NEXT:    [[P:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[B:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
// CHECK-NEXT:    [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr
// CHECK-NEXT:    [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B:%.*]] to ptr
// CHECK-NEXT:    store ptr [[B_ASCAST]], ptr [[P_ASCAST]], align 8
// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[B]], ptr addrspace(5) align 4 [[TMP0:%.*]], i64 400, i1 false)
// CHECK-NEXT:    store ptr [[B]], ptr [[P_ASCAST]], align 8
// CHECK-NEXT:    ret void
//
void func_with_byval_arg(B b) {
@@ -91,7 +92,7 @@ void func_with_byval_arg(B b) {
// CHECK-NEXT:    [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[B_ASCAST]], i64 400, i1 false)
// CHECK-NEXT:    [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
// CHECK-NEXT:    call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byval([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
// CHECK-NEXT:    call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
// CHECK-NEXT:    call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]])
// CHECK-NEXT:    ret void
//
@@ -107,7 +108,7 @@ void test_byval_arg_auto() {
// CHECK-NEXT:    [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false)
// CHECK-NEXT:    [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
// CHECK-NEXT:    call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byval([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
// CHECK-NEXT:    call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
// CHECK-NEXT:    call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr))
// CHECK-NEXT:    ret void
//
Loading