Commit 3de645ef authored by Johannes Doerfert's avatar Johannes Doerfert
Browse files

[OpenMP][NFC] Split the reduction buffer size into two components

Before, we tracked the total size of the teams reduction buffer in order to
allocate it at runtime per kernel launch. This patch splits that number
into two parts: the size of the reduction data (= all reduction
variables) and the (maximal) length of the buffer. This will allow us to
allocate less if we need less, e.g., if we have fewer teams than the
maximal length. It also allows us to move code from clang's codegen into
the runtime, as we now know how large the reduction data is.
parent 921bd299
Loading
Loading
Loading
Loading
+27 −27
Original line number Diff line number Diff line
@@ -801,10 +801,12 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  const auto &DL = CGM.getModule().getDataLayout();
  uint64_t BufferSize =
  uint64_t ReductionDataSize =
      DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, BufferSize);
  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
                                C.getLangOpts().OpenMPCUDAReductionBufNum);
  TeamsReductions.clear();
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2828,15 +2830,25 @@ void CGOpenMPRuntimeGPU::emitReduction(
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
  int Cnt = 0;
  for (const Expr *DRE : Privates) {
    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
    ++Cnt;
  }

  ASTContext &C = CGM.getContext();
  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
  llvm::Value *ThreadId = getThreadID(CGF, Loc);

  llvm::Value *Res;
  ASTContext &C = CGM.getContext();
  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
@@ -2878,19 +2890,17 @@ void CGOpenMPRuntimeGPU::emitReduction(
  llvm::Function *ReductionFn = emitReductionFunction(
      CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
      Privates, LHSExprs, RHSExprs, ReductionOps);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Value *ReductionDataSize =
      CGF.getTypeSize(C.getRecordType(ReductionRec));
  ReductionDataSize =
      CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  if (ParallelReduction) {
    llvm::Value *Args[] = {RTLoc,
                           ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
    llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
@@ -2899,37 +2909,27 @@ void CGOpenMPRuntimeGPU::emitReduction(
        Args);
  } else {
    assert(TeamsReduction && "expected teams reduction.");
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
    int Cnt = 0;
    for (const Expr *DRE : Privates) {
      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
      ++Cnt;
    }
    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
        CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
        C.getLangOpts().OpenMPCUDAReductionBufNum);
    TeamsReductions.push_back(TeamReductionRec);
    TeamsReductions.push_back(ReductionRec);
    auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
        {}, "_openmp_teams_reductions_buffer_$_$ptr");
    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
        ReductionFn);
    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
        ReductionFn);

    llvm::Value *Args[] = {
        RTLoc,
        ThreadId,
        KernelTeamsReductionPtr,
        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
        ReductionDataSize,
        RL,
        ShuffleAndReduceFn,
        InterWarpCopyFn,
+117 −135

File changed.

Preview size limit exceeded, changes collapsed.

+10 −14
Original line number Diff line number Diff line
@@ -329,13 +329,11 @@ void test() {
// CHECK1-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]]
// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
// CHECK1-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4, !tbaa [[TBAA14]]
// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP35]], align 8
// CHECK1-NEXT:    [[TMP36:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP34]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
// CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[TMP36]], 1
// CHECK1-NEXT:    br i1 [[TMP37]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
// CHECK1-NEXT:    [[TMP34:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
// CHECK1-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP34]], 1
// CHECK1-NEXT:    br i1 [[TMP35]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1:       .omp.reduction.then:
// CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[TMP2]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR12]]
// CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -822,13 +820,11 @@ void test() {
// CHECK1-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]]
// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
// CHECK1-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4, !tbaa [[TBAA14]]
// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP35]], align 8
// CHECK1-NEXT:    [[TMP36:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP34]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2)
// CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[TMP36]], 1
// CHECK1-NEXT:    br i1 [[TMP37]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
// CHECK1-NEXT:    [[TMP34:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2)
// CHECK1-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP34]], 1
// CHECK1-NEXT:    br i1 [[TMP35]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1:       .omp.reduction.then:
// CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[TMP2]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR12]]
// CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+180 −198

File changed.

Preview size limit exceeded, changes collapsed.

+8 −10
Original line number Diff line number Diff line
@@ -146,17 +146,15 @@ int main()
// CHECK-NEXT:    store ptr [[TMP7]], ptr [[TMP]], align 8
// CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8
// CHECK-NEXT:    store double 1.000000e+01, ptr [[TMP8]], align 8
// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
// CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK-NEXT:    store ptr [[E2]], ptr [[TMP11]], align 8
// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP10]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
// CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
// CHECK-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK-NEXT:    store ptr [[E2]], ptr [[TMP9]], align 8
// CHECK-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
// CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
// CHECK-NEXT:    br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK:       .omp.reduction.then:
// CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
// CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[E2]], align 8
// CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]]
// CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
// CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr [[E2]], align 8
// CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP12]], [[TMP13]]
// CHECK-NEXT:    store double [[ADD]], ptr [[ARRAYIDX]], align 8
// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
// CHECK:       .omp.reduction.done:
Loading