Unverified commit 3c39478e authored by Sergio Afonso, committed by GitHub

[MLIR][OpenMP] Support allocations of device shared memory (#150924)

This patch updates the allocation of some reduction and private
variables within target regions to use device shared memory rather than
private memory. This is a prerequisite to produce working Generic
kernels containing parallel regions.

In particular, the following situations result in the use of device
shared memory (only when compiling for the target device and when they
are placed inside a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are
reduced or used inside of a `parallel` region.

There is currently no support for delayed privatization on `teams`
constructs, so private variables on those constructs are not yet
affected. Once support is added, if it uses the existing
`allocatePrivateVars` and `cleanupPrivateVars` functions, device shared
memory will be used automatically.
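
The core of the change is a guarded choice between a regular alloca and a
device shared memory allocation. A minimal C++ sketch of that pattern, reusing
the OpenMPIRBuilder entry points that appear in the diff below (the wrapper
function and its parameters are illustrative, not code from this patch):

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
  #include "llvm/IR/IRBuilder.h"

  // Sketch only: use device shared memory when the enclosing target region is
  // a Generic kernel compiled for the device (the condition computed by the
  // new mightAllocInDeviceSharedMemory() and
  // mustAllocPrivateVarInDeviceSharedMemory() helpers), and fall back to a
  // private stack slot otherwise.
  static llvm::Value *allocatePrivateStorage(llvm::OpenMPIRBuilder &ompBuilder,
                                             llvm::IRBuilderBase &builder,
                                             llvm::Type *varTy,
                                             bool useDeviceSharedMem) {
    if (useDeviceSharedMem)
      // Lowers to a __kmpc_alloc_shared() call; it must be paired with a
      // later createOMPFreeShared() once the construct finishes.
      return ompBuilder.createOMPAllocShared(builder, varTy);
    // Regular private memory, as before this patch.
    return builder.CreateAlloca(varTy);
  }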
parent 7b62dd1a
+168 −63
@@ -1111,12 +1111,63 @@ struct DeferredStore {
};
} // namespace

/// Check whether allocations for the given operation might have to be done in
/// device shared memory. That means we're compiling for an offloading target,
/// the operation is an `omp::TargetOp` or is nested inside one, and that
/// target region represents a Generic (non-SPMD) kernel.
///
/// This represents a necessary but not sufficient set of conditions to use
/// device shared memory in place of regular allocas. For some variables, the
/// associated OpenMP construct or their uses might also need to be taken into
/// account.
static bool
mightAllocInDeviceSharedMemory(Operation &op,
                               const llvm::OpenMPIRBuilder &ompBuilder) {
  if (!ompBuilder.Config.isTargetDevice())
    return false;

  auto targetOp = dyn_cast<omp::TargetOp>(op);
  if (!targetOp)
    targetOp = op.getParentOfType<omp::TargetOp>();

  return targetOp &&
         targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
             omp::TargetExecMode::generic;
}

/// Check whether the entry block argument representing the private copy of a
/// variable in an OpenMP construct must be allocated in device shared memory,
/// based on what the uses of that copy are.
///
/// This must only be called if a previous call to
/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
/// operation that owns the specified block argument.
static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
  Operation *parentOp = value.getOwner()->getParentOp();
  auto targetOp = dyn_cast<omp::TargetOp>(parentOp);
  if (!targetOp)
    targetOp = parentOp->getParentOfType<omp::TargetOp>();
  assert(targetOp && "expected a parent omp.target operation");

  for (auto *user : value.getUsers()) {
    if (auto parallelOp = dyn_cast<omp::ParallelOp>(user)) {
      if (llvm::is_contained(parallelOp.getReductionVars(), value))
        return true;
    } else if (auto parallelOp = user->getParentOfType<omp::ParallelOp>()) {
      if (parentOp->isProperAncestor(parallelOp))
        return true;
    }
  }

  return false;
}

/// Allocate space for privatized reduction variables.
/// `deferredStores` contains information to create store operations which need
/// to be inserted after all allocas.
template <typename T>
static LogicalResult
allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
                   llvm::IRBuilderBase &builder,
                   LLVM::ModuleTranslation &moduleTranslation,
                   const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1128,10 +1179,14 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
  llvm::IRBuilderBase::InsertPointGuard guard(builder);
  builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());

  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
  bool useDeviceSharedMem =
      isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);

  // delay creating stores until after all allocas
  deferredStores.reserve(loop.getNumReductionVars());
  deferredStores.reserve(op.getNumReductionVars());

  for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
  for (std::size_t i = 0; i < op.getNumReductionVars(); ++i) {
    Region &allocRegion = reductionDecls[i].getAllocRegion();
    if (isByRefs[i]) {
      if (allocRegion.empty())
@@ -1140,7 +1195,7 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
      SmallVector<llvm::Value *, 1> phis;
      if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc",
                                         builder, moduleTranslation, &phis)))
        return loop.emitError(
        return op.emitError(
            "failed to inline `alloc` region of `omp.declare_reduction`");

      assert(phis.size() == 1 && "expected one allocation to be yielded");
@@ -1148,33 +1203,43 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,

      // Allocate reduction variable (which is a pointer to the real reduction
      // variable allocated in the inlined region)
      llvm::Value *var = builder.CreateAlloca(
          moduleTranslation.convertType(reductionDecls[i].getType()));

      llvm::Type *ptrTy = builder.getPtrTy();
      llvm::Value *castVar =
          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
      llvm::Type *varTy =
          moduleTranslation.convertType(reductionDecls[i].getType());
      llvm::Value *var;
      if (useDeviceSharedMem) {
        var = ompBuilder->createOMPAllocShared(builder, varTy);
      } else {
        var = builder.CreateAlloca(varTy);
        var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
      }

      llvm::Value *castPhi =
          builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy);

      deferredStores.emplace_back(castPhi, castVar);
      deferredStores.emplace_back(castPhi, var);

      privateReductionVariables[i] = castVar;
      privateReductionVariables[i] = var;
      moduleTranslation.mapValue(reductionArgs[i], castPhi);
      reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi);
      reductionVariableMap.try_emplace(op.getReductionVars()[i], castPhi);
    } else {
      assert(allocRegion.empty() &&
             "allocaction is implicit for by-val reduction");
      llvm::Value *var = builder.CreateAlloca(
          moduleTranslation.convertType(reductionDecls[i].getType()));

      llvm::Type *ptrTy = builder.getPtrTy();
      llvm::Value *castVar =
          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
      llvm::Type *varTy =
          moduleTranslation.convertType(reductionDecls[i].getType());
      llvm::Value *var;
      if (useDeviceSharedMem) {
        var = ompBuilder->createOMPAllocShared(builder, varTy);
      } else {
        var = builder.CreateAlloca(varTy);
        var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
      }

      moduleTranslation.mapValue(reductionArgs[i], castVar);
      privateReductionVariables[i] = castVar;
      reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar);
      moduleTranslation.mapValue(reductionArgs[i], var);
      privateReductionVariables[i] = var;
      reductionVariableMap.try_emplace(op.getReductionVars()[i], var);
    }
  }

@@ -1246,6 +1311,10 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
  if (op.getNumReductionVars() == 0)
    return success();

  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
  bool useDeviceSharedMem =
      isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);

  llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
  auto allocaIP = llvm::IRBuilderBase::InsertPoint(
      latestAllocaBlock, latestAllocaBlock->getTerminator()->getIterator());
@@ -1260,8 +1329,12 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
      // TODO: remove after all users of by-ref are updated to use the alloc
      // region: Allocate reduction variable (which is a pointer to the real
      // reduction variable allocated in the inlined region)
      byRefVars[i] = builder.CreateAlloca(
          moduleTranslation.convertType(reductionDecls[i].getType()));
      llvm::Type *varTy =
          moduleTranslation.convertType(reductionDecls[i].getType());
      if (useDeviceSharedMem)
        byRefVars[i] = ompBuilder->createOMPAllocShared(builder, varTy);
      else
        byRefVars[i] = builder.CreateAlloca(varTy);
    }
  }

@@ -1483,10 +1556,20 @@ static LogicalResult createReductionsAndCleanup(
                  [](omp::DeclareReductionOp reductionDecl) {
                    return &reductionDecl.getCleanupRegion();
                  });
  return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables,
                                moduleTranslation, builder,
  LogicalResult result = inlineOmpRegionCleanup(
      reductionRegions, privateReductionVariables, moduleTranslation, builder,
      "omp.reduction.cleanup");
  return success();

  bool useDeviceSharedMem =
      isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
  if (useDeviceSharedMem) {
    for (auto [var, reductionDecl] :
         llvm::zip_equal(privateReductionVariables, reductionDecls))
      ompBuilder->createOMPFreeShared(
          builder, var, moduleTranslation.convertType(reductionDecl.getType()));
  }

  return result;
}

static ArrayRef<bool> getIsByRef(std::optional<ArrayRef<bool>> attr) {
@@ -1643,8 +1726,9 @@ initPrivateVars(llvm::IRBuilderBase &builder,
/// Allocate and initialize delayed private variables. Returns the basic block
/// which comes after all of these allocations. The llvm::Value * for each of
/// these private variables is populated in llvmPrivateVars.
template <typename T>
static llvm::Expected<llvm::BasicBlock *>
allocatePrivateVars(llvm::IRBuilderBase &builder,
allocatePrivateVars(T op, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation,
                    PrivateVarsInfo &privateVarsInfo,
                    const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1667,6 +1751,10 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
  llvm::DataLayout dataLayout = builder.GetInsertBlock()->getDataLayout();
  llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);

  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
  bool mightUseDeviceSharedMem =
      isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
      mightAllocInDeviceSharedMemory(*op, *ompBuilder);
  unsigned int allocaAS =
      moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
  unsigned int defaultAS = moduleTranslation.getLLVMModule()
@@ -1679,11 +1767,17 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
    llvm::Type *llvmAllocType =
        moduleTranslation.convertType(privDecl.getType());
    builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
    llvm::Value *llvmPrivateVar = builder.CreateAlloca(
    llvm::Value *llvmPrivateVar = nullptr;
    if (mightUseDeviceSharedMem &&
        mustAllocPrivateVarInDeviceSharedMemory(blockArg)) {
      llvmPrivateVar = ompBuilder->createOMPAllocShared(builder, llvmAllocType);
    } else {
      llvmPrivateVar = builder.CreateAlloca(
          llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
      if (allocaAS != defaultAS)
      llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar,
                                                   builder.getPtrTy(defaultAS));
        llvmPrivateVar = builder.CreateAddrSpaceCast(
            llvmPrivateVar, builder.getPtrTy(defaultAS));
    }

    privateVarsInfo.llvmVars.push_back(llvmPrivateVar);
  }
@@ -1791,24 +1885,41 @@ static LogicalResult copyFirstPrivateVars(
                              mappedPrivateVars);
}

template <typename T>
static LogicalResult
cleanupPrivateVars(llvm::IRBuilderBase &builder,
cleanupPrivateVars(T op, llvm::IRBuilderBase &builder,
                   LLVM::ModuleTranslation &moduleTranslation, Location loc,
                   SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
                   SmallVectorImpl<omp::PrivateClauseOp> &privateDecls) {
                   PrivateVarsInfo &privateVarsInfo) {
  // private variable deallocation
  SmallVector<Region *> privateCleanupRegions;
  llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
  llvm::transform(privateVarsInfo.privatizers,
                  std::back_inserter(privateCleanupRegions),
                  [](omp::PrivateClauseOp privatizer) {
                    return &privatizer.getDeallocRegion();
                  });

  if (failed(inlineOmpRegionCleanup(
          privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder,
          "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
  if (failed(inlineOmpRegionCleanup(privateCleanupRegions,
                                    privateVarsInfo.llvmVars, moduleTranslation,
                                    builder, "omp.private.dealloc",
                                    /*shouldLoadCleanupRegionArg=*/false)))
    return mlir::emitError(loc, "failed to inline `dealloc` region of an "
                                "`omp.private` op in");

  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
  bool mightUseDeviceSharedMem =
      isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
      mightAllocInDeviceSharedMemory(*op, *ompBuilder);
  for (auto [privDecl, llvmPrivVar, blockArg] :
       llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars,
                       privateVarsInfo.blockArgs)) {
    if (mightUseDeviceSharedMem &&
        mustAllocPrivateVarInDeviceSharedMemory(blockArg)) {
      ompBuilder->createOMPFreeShared(
          builder, llvmPrivVar,
          moduleTranslation.convertType(privDecl.getType()));
    }
  }

  return success();
}

@@ -2894,9 +3005,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,

    builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());

    if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(),
                                  privateVarsInfo.llvmVars,
                                  privateVarsInfo.privatizers)))
    if (failed(cleanupPrivateVars(taskOp, builder, moduleTranslation,
                                  taskOp.getLoc(), privateVarsInfo)))
      return llvm::make_error<PreviouslyReportedError>();

    // Free heap allocated task context structure at the end of the task.
@@ -3309,9 +3419,8 @@ convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
    // handled transparently by how these are passed to the structure passed
    // into the outlined function. When the task is duplicated, that structure
    // is duplicated too.
    if (failed(cleanupPrivateVars(builder, moduleTranslation,
                                  contextOp.getLoc(), privateVarsInfo.llvmVars,
                                  privateVarsInfo.privatizers)))
    if (failed(cleanupPrivateVars(contextOp, builder, moduleTranslation,
                                  contextOp.getLoc(), privateVarsInfo)))
      return llvm::make_error<PreviouslyReportedError>();
    // Similarly, the task context structure freed inside the task is the
    // per-task copy after task duplication.
@@ -3537,7 +3646,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
      wsloopOp.getNumReductionVars());

  llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
      builder, moduleTranslation, privateVarsInfo, allocaIP);
      wsloopOp, builder, moduleTranslation, privateVarsInfo, allocaIP);
  if (handleError(afterAllocas, opInst).failed())
    return failure();

@@ -3686,9 +3795,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
          /*isTeamsReduction=*/false)))
    return failure();

  return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(),
                            privateVarsInfo.llvmVars,
                            privateVarsInfo.privatizers);
  return cleanupPrivateVars(wsloopOp, builder, moduleTranslation,
                            wsloopOp.getLoc(), privateVarsInfo);
}

/// Converts the OpenMP parallel operation to LLVM IR.
@@ -3716,7 +3824,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
  auto bodyGenCB = [&](InsertPointTy allocaIP,
                       InsertPointTy codeGenIP) -> llvm::Error {
    llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
        builder, moduleTranslation, privateVarsInfo, allocaIP);
        opInst, builder, moduleTranslation, privateVarsInfo, allocaIP);
    if (handleError(afterAllocas, *opInst).failed())
      return llvm::make_error<PreviouslyReportedError>();

@@ -3833,9 +3941,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
      return llvm::createStringError(
          "failed to inline `cleanup` region of `omp.declare_reduction`");

    if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst.getLoc(),
                                  privateVarsInfo.llvmVars,
                                  privateVarsInfo.privatizers)))
    if (failed(cleanupPrivateVars(opInst, builder, moduleTranslation,
                                  opInst.getLoc(), privateVarsInfo)))
      return llvm::make_error<PreviouslyReportedError>();

    // If we could be performing cancellation, add the cancellation barrier on
@@ -3918,7 +4025,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
      findAllocaInsertPoint(builder, moduleTranslation);

  llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
      builder, moduleTranslation, privateVarsInfo, allocaIP);
      simdOp, builder, moduleTranslation, privateVarsInfo, allocaIP);
  if (handleError(afterAllocas, opInst).failed())
    return failure();

@@ -4099,9 +4206,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
                                    "omp.reduction.cleanup")))
    return failure();

  return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(),
                            privateVarsInfo.llvmVars,
                            privateVarsInfo.privatizers);
  return cleanupPrivateVars(simdOp, builder, moduleTranslation, simdOp.getLoc(),
                            privateVarsInfo);
}

/// Converts an OpenMP loop nest into LLVM IR using OpenMPIRBuilder.
@@ -6362,8 +6468,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
    builder.restoreIP(codeGenIP);
    PrivateVarsInfo privVarsInfo(distributeOp);

    llvm::Expected<llvm::BasicBlock *> afterAllocas =
        allocatePrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP);
    llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
        distributeOp, builder, moduleTranslation, privVarsInfo, allocaIP);
    if (handleError(afterAllocas, opInst).failed())
      return llvm::make_error<PreviouslyReportedError>();

@@ -6418,9 +6524,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
      if (!wsloopIP)
        return wsloopIP.takeError();
    }
    if (failed(cleanupPrivateVars(builder, moduleTranslation,
                                  distributeOp.getLoc(), privVarsInfo.llvmVars,
                                  privVarsInfo.privatizers)))
    if (failed(cleanupPrivateVars(distributeOp, builder, moduleTranslation,
                                  distributeOp.getLoc(), privVarsInfo)))
      return llvm::make_error<PreviouslyReportedError>();

    return llvm::Error::success();
@@ -7196,8 +7301,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
    PrivateVarsInfo privateVarsInfo(targetOp);

    llvm::Expected<llvm::BasicBlock *> afterAllocas =
        allocatePrivateVars(builder, moduleTranslation, privateVarsInfo,
                            allocaIP, &mappedPrivateVars);
        allocatePrivateVars(targetOp, builder, moduleTranslation,
                            privateVarsInfo, allocaIP, &mappedPrivateVars);

    if (failed(handleError(afterAllocas, *targetOp)))
      return llvm::make_error<PreviouslyReportedError>();
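
Taken together, the allocation and cleanup paths above are symmetric: every
private copy placed in device shared memory is freed again once the construct's
`dealloc` regions have run. A condensed, illustrative sketch of that pairing
(not verbatim code from the patch; it reuses the names visible in the diff
above):

  // Assumed context: `op` is the omp.teams or omp.distribute being translated,
  // `blockArg` is the entry block argument of one private variable, and
  // `llvmAllocType` is its translated LLVM type.
  bool mightUseDeviceSharedMem =
      isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
      mightAllocInDeviceSharedMemory(*op, *ompBuilder);
  bool useShared = mightUseDeviceSharedMem &&
                   mustAllocPrivateVarInDeviceSharedMemory(blockArg);

  // In allocatePrivateVars(): allocate the private copy.
  llvm::Value *llvmPrivateVar =
      useShared ? ompBuilder->createOMPAllocShared(builder, llvmAllocType)
                : builder.CreateAlloca(llvmAllocType);

  // In cleanupPrivateVars(): release it after inlining the `dealloc` regions.
  if (useShared)
    ompBuilder->createOMPFreeShared(builder, llvmPrivateVar, llvmAllocType);

Host compilation and SPMD kernels are unaffected and keep using plain allocas;
the new test below exercises the Generic-kernel path.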
+81 −0
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// This test checks that, when compiling for an offloading target, device shared
// memory will be used in place of allocas for certain private variables.

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
  omp.private {type = private} @privatizer : i32
  omp.declare_reduction @reduction : i32 init {
  ^bb0(%arg0: i32):
    %0 = llvm.mlir.constant(0 : i32) : i32
    omp.yield(%0 : i32)
  } combiner {
  ^bb0(%arg0: i32, %arg1: i32):
    %0 = llvm.add %arg0, %arg1 : i32
    omp.yield(%0 : i32)
  }
  llvm.func @main() {
    %c0 = llvm.mlir.constant(1 : i64) : i64
    %1 = llvm.alloca %c0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr<5>
    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
    %3 = llvm.alloca %c0 x i32 {bindc_name = "y"} : (i64) -> !llvm.ptr<5>
    %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr
    %5 = llvm.alloca %c0 x i32 {bindc_name = "z"} : (i64) -> !llvm.ptr<5>
    %6 = llvm.addrspacecast %5 : !llvm.ptr<5> to !llvm.ptr
    %7 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"}
    %8 = omp.map.info var_ptr(%4 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "y"}
    %9 = omp.map.info var_ptr(%6 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "z"}
    omp.target map_entries(%7 -> %arg0, %8 -> %arg1, %9 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
      %11 = llvm.mlir.constant(10000 : i32) : i32
      %12 = llvm.mlir.constant(1 : i32) : i32
      omp.teams reduction(@reduction %arg0 -> %arg3 : !llvm.ptr) {
        omp.distribute private(@privatizer %arg1 -> %arg4, @privatizer %arg2 -> %arg5 : !llvm.ptr, !llvm.ptr) {
          omp.loop_nest (%arg6) : i32 = (%12) to (%11) inclusive step (%12) {
            llvm.store %arg6, %arg4 : i32, !llvm.ptr
            %13 = llvm.load %arg3 : !llvm.ptr -> i32
            %14 = llvm.add %13, %12 : i32
            llvm.store %14, %arg3 : i32, !llvm.ptr
            omp.parallel reduction(@reduction %arg5 -> %arg7 : !llvm.ptr) {
              %15 = llvm.load %arg4 : !llvm.ptr -> i32
              %16 = llvm.load %arg7 : !llvm.ptr -> i32
              %17 = llvm.add %15, %16 : i32
              llvm.store %17, %arg7 : i32, !llvm.ptr
              omp.terminator
            }
            omp.yield
          }
        }
        omp.terminator
      }
      omp.terminator
    }
    // CHECK: call i32 @__kmpc_target_init
    // CHECK: call void @[[OUTLINED_TARGET:__omp_offloading_[A-Za-z0-9_.]*]]

    // CHECK: define internal void @[[OUTLINED_TARGET]]
    // CHECK: %[[X_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
    // CHECK: %[[GEP_X:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg
    // CHECK-NEXT: store ptr %[[X_PRIV]], ptr addrspace(5) %[[GEP_X]]
    // CHECK-NEXT: call void @[[OUTLINED_TEAMS:__omp_offloading_[A-Za-z0-9_.]*]](ptr %structArg.ascast)

    // CHECK: [[REDUCE_FINALIZE_BB:reduce\.finalize.*]]:
    // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[X_PRIV]], i64 4)

    // CHECK: define internal void @[[OUTLINED_TEAMS]]
    // CHECK: %[[Y_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
    // CHECK: %[[Z_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)

    // %[[GEP_Y:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg
    // store ptr %[[Y_PRIV]], ptr addrspace(5) %[[GEP_Y]], align 8
    // %[[GEP_Z:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg
    // store ptr %[[Z_PRIV]], ptr addrspace(5) %[[GEP_Z]], align 8

    // CHECK: call void @__kmpc_free_shared(ptr %[[Y_PRIV]], i64 4)
    // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[Z_PRIV]], i64 4)
    // CHECK-NEXT: br label %[[EXIT_BB:.*]]

    // CHECK: [[EXIT_BB]]:
    // CHECK-NEXT: ret void
    llvm.return
  }
}