Unverified Commit f93a697e authored by Michael Klemm's avatar Michael Klemm Committed by GitHub
Browse files

[libomptarget][OpenMP] Initial implementation of omp_target_memset() and...

[libomptarget][OpenMP] Initial implementation of omp_target_memset() and omp_target_memset_async() (#68706)

Implement a slow-path version of omp_target_memset*() 

There is a TODO to implement a fast path that uses an on-device
kernel instead of the host-based memory fill operation.  This may
require some additional plumbing to have kernels in libomptarget.so
parent 970e7456
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                           const size_t *DstDimensions,
                           const size_t *SrcDimensions, int DstDevice,
                           int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                             size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
+108 −24
Original line number Diff line number Diff line
@@ -210,7 +210,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
}

// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
static int libomp_target_memcpy_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
  if (Task == nullptr)
    return OFFLOAD_FAIL;

@@ -241,47 +241,129 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
  return Rc;
}

// Allocate and launch helper task
static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
                                       int DepObjCount,
static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
  if (!Task)
    return OFFLOAD_FAIL;

  auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
  if (!Args)
    return OFFLOAD_FAIL;

  // call omp_target_memset()
  omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum);

  delete Args;

  return OFFLOAD_SUCCESS;
}

static inline void
convertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
                    omp_depend_t *DepObjList) {
  for (int i = 0; i < DepObjCount; ++i) {
    omp_depend_t DepObj = DepObjList[i];
    Vec.push_back(*((kmp_depend_info_t *)DepObj));
  }
}

template <class T>
static inline int
libomp_helper_task_creation(T *Args, int (*Fn)(kmp_int32, kmp_task_t *),
                            int DepObjCount, omp_depend_t *DepObjList) {
  // Create global thread ID
  int Gtid = __kmpc_global_thread_num(nullptr);
  int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;

  // Setup the hidden helper flags;
  // Setup the hidden helper flags
  kmp_int32 Flags = 0;
  kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
  InputFlags->hidden_helper = 1;

  // Alloc helper task
  kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
                                                 sizeof(kmp_task_t), 0, Fn, -1);

  if (Ptr == nullptr) {
    // Task allocation failed, delete the argument object
  // Alloc the helper task
  kmp_task_t *Task = __kmpc_omp_target_task_alloc(
      nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
  if (!Task) {
    delete Args;

    return OFFLOAD_FAIL;
  }

  // Setup the arguments passed to helper task
  Ptr->shareds = Args;
  // Setup the arguments for the helper task
  Task->shareds = Args;

  // Convert the type of depend objects
  // Convert types of depend objects
  llvm::SmallVector<kmp_depend_info_t> DepObjs;
  for (int i = 0; i < DepObjCount; i++) {
    omp_depend_t DepObj = DepObjList[i];
    DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
  }
  convertDepObjVector(DepObjs, DepObjCount, DepObjList);

  // Launch the helper task
  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
                                     DepObjs.data(), 0, nullptr);

  return Rc;
}

EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
                               int DeviceNum) {
  TIMESCOPE();
  DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
     DeviceNum, Ptr, NumBytes);

  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
  // of unspecified behavior, see OpenMP spec).
  if (!Ptr || NumBytes == 0) {
    return Ptr;
  }

  if (DeviceNum == omp_get_initial_device()) {
    DP("filling memory on host via memset");
    memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
  } else {
    // TODO: replace the omp_target_memset() slow path with the fast path.
    // That will require the ability to execute a kernel from within
    // libomptarget.so (which we do not have at the moment).

    // This is a very slow path: create a filled array on the host and upload
    // it to the GPU device.
    int InitialDevice = omp_get_initial_device();
    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
    if (Shadow) {
      (void)memset(Shadow, ByteVal, NumBytes);
      (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
                              InitialDevice);
      (void)omp_target_free(Shadow, InitialDevice);
    } else {
      // If the omp_target_alloc has failed, let's just not do anything.
      // omp_target_memset does not have any good way to fail, so we
      // simply avoid a catastrophic failure of the process for now.
      DP("omp_target_memset failed to fill memory due to error with "
         "omp_target_alloc");
    }
  }

  DP("omp_target_memset returns %p\n", Ptr);
  return Ptr;
}

EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes,
                                     int DeviceNum, int DepObjCount,
                                     omp_depend_t *DepObjList) {
  DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
     DeviceNum, Ptr, NumBytes);

  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
  // of unspecified behavior, see OpenMP spec).
  if (!Ptr || NumBytes == 0)
    return Ptr;

  // Create the task object to deal with the async invocation
  auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};

  // omp_target_memset_async() cannot fail via a return code, so ignore the
  // return code of the helper function
  (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task,
                                    DepObjCount, DepObjList);

  return Ptr;
}

EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
                                   size_t DstOffset, size_t SrcOffset,
                                   int DstDevice, int SrcDevice,
@@ -302,7 +384,8 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
      Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);

  // Create and launch helper task
  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
  int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
                                       DepObjCount, DepObjList);

  DP("omp_target_memcpy_async returns %d\n", Rc);
  return Rc;
@@ -399,7 +482,8 @@ EXTERN int omp_target_memcpy_rect_async(
      DstDimensions, SrcDimensions, DstDevice, SrcDevice);

  // Create and launch helper task
  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
  int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
                                       DepObjCount, DepObjList);

  DP("omp_target_memcpy_rect_async returns %d\n", Rc);
  return Rc;
+2 −0
Original line number Diff line number Diff line
@@ -44,6 +44,8 @@ VERS1.0 {
    omp_target_memcpy_rect;
    omp_target_memcpy_async;
    omp_target_memcpy_rect_async;
    omp_target_memset;
    omp_target_memset_async;
    omp_target_associate_ptr;
    omp_target_disassociate_ptr;
    llvm_omp_target_alloc_host;
+11 −0
Original line number Diff line number Diff line
@@ -253,6 +253,17 @@ struct TargetMemcpyArgsTy {
        DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
        DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
};

struct TargetMemsetArgsTy {
  // Common attributes of a memset operation
  void *Ptr;
  int C;
  size_t N;
  int DeviceNum;

  // no constructors defined, because this is a PoD
};

// Invalid GTID as defined by libomp; keep in sync
#define KMP_GTID_DNE (-2)
#ifdef __cplusplus
+45 −0
Original line number Diff line number Diff line
// RUN: %libomptarget-compile-and-run-generic

#include "stdio.h"
#include <omp.h>
#include <stdlib.h>

int main() {
  int d = omp_get_default_device();
  int id = omp_get_initial_device();
  int q[128], i;
  void *p;
  void *result;

  if (d < 0 || d >= omp_get_num_devices())
    d = id;

  p = omp_target_alloc(130 * sizeof(int), d);
  if (p == NULL)
    return 0;

  for (i = 0; i < 128; i++)
    q[i] = i;

  result = omp_target_memset(p, 0, 130 * sizeof(int), d);
  if (result != p) {
    abort();
  }

  int q2[128];
  for (i = 0; i < 128; ++i)
    q2[i] = i;
  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
                              0, NULL))
    abort();

#pragma omp taskwait

  for (i = 0; i < 128; ++i)
    if (q2[i] != 0)
      abort();

  omp_target_free(p, d);

  return 0;
}
Loading