Commit 29fa096f authored by Doak, Peter W.'s avatar Doak, Peter W.
Browse files

Partial fix for a race condition in the initialization of tp_accumulator_gpu

parent 33e3b9a5
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -209,6 +209,9 @@ public:
  template <DeviceType rhs_device_name>
  void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

  template <DeviceType rhs_device_name>
  void set(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream);

  // Asynchronous assignment.
  template <DeviceType rhs_device_name>
  void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream);
@@ -429,6 +432,18 @@ void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_na
                   stream_id);
}

// Synchronous cross-device assignment on an explicit stream: resizes this
// matrix to match `rhs`, enqueues the 2D copy on `stream`, then blocks the
// host until the copy has completed.
template <typename ScalarType, DeviceType device_name>
template <DeviceType rhs_device_name>
void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
                                          const util::GpuStream& stream) {
  resize(rhs.size_);
  util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream);
  // Wait on an event recorded after the copy rather than synchronizing the
  // whole device; cudaEventBlockingSync lets the host thread sleep while
  // waiting instead of spin-polling.
  cudaEvent_t set_event;
  checkRC(cudaEventCreateWithFlags(&set_event, cudaEventBlockingSync));
  checkRC(cudaEventRecord(set_event, stream));
  checkRC(cudaEventSynchronize(set_event));
  // Fix: the event was previously leaked; events must be destroyed after use.
  checkRC(cudaEventDestroy(set_event));
}

template <typename ScalarType, DeviceType device_name>
template <DeviceType rhs_device_name>
void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs,
+23 −3
Original line number Diff line number Diff line
@@ -30,10 +30,29 @@ class GpuStream {
public:
  // Owning default constructor: creates a fresh CUDA stream that this
  // wrapper is responsible for destroying (see ~GpuStream()).
  GpuStream() {
    checkRC(cudaStreamCreate(&stream_));
    owning_ = true;
  }

  // Non-owning converting constructor: wraps an externally managed
  // cudaStream_t handle without taking ownership of it.
  GpuStream(const cudaStream_t& stream) {
    stream_ = stream;
    owning_ = false;
  }

  // Non-owning copy: the copy aliases other's stream handle; ownership (if
  // any) stays with `other`.
  // NOTE(review): the previous `= delete` declarations of the copy operations
  // were removed by this change — the diff rendering showed both; only the
  // definitions below are kept.
  GpuStream(const GpuStream& other) {
    stream_ = other.stream_;
    owning_ = false;
  }

  /** Copy assignment does not take possession of the cuda stream.
   *  Any stream this instance currently owns is destroyed first.
   */
  GpuStream& operator=(const GpuStream& other) {
    // Self-assignment guard: without it, an owning instance would destroy its
    // stream and then alias the now-destroyed handle.
    if (this != &other) {
      if (owning_ && stream_)
        checkRC(cudaStreamDestroy(stream_));
      stream_ = other.stream_;
      owning_ = false;
    }
    return *this;
  }

  GpuStream(GpuStream&& other) noexcept {
    swap(other);
@@ -58,7 +77,7 @@ public:
  }

  ~GpuStream() {
    // Destroy the stream only if this instance owns it; non-owning views
    // alias a stream whose lifetime is managed elsewhere. (The unconditional
    // `if (stream_)` check predating the owning_ flag is dropped.)
    if (owning_ && stream_)
      checkRC(cudaStreamDestroy(stream_));
  }

@@ -72,6 +91,7 @@ public:

private:
  // Wrapped CUDA stream handle; nullptr when empty/moved-from.
  cudaStream_t stream_ = nullptr;
  // True iff this instance created stream_ and must destroy it in ~GpuStream().
  bool owning_ = false;
};

#else  // DCA_HAVE_GPU
+14 −0
Original line number Diff line number Diff line
@@ -50,6 +50,11 @@ struct Memory<CPU> {
  static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& /*s*/) {
    setToZero(ptr, size);
  }

  // Stream-taking overload kept for API parity with the GPU specialization;
  // on the CPU the zeroing completes immediately, so the stream is ignored.
  template <typename ScalarType>
  static void setToZero(ScalarType* ptr, size_t size, const GpuStream& /*stream: unused*/) {
    Memory<CPU>::setToZero(ptr, size);
  }
};

#ifdef DCA_HAVE_GPU
@@ -77,6 +82,15 @@ struct Memory<GPU> {
  static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& stream) {
    cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream);
  }

  // Blocking zero-fill of `size` elements of device memory: the memset is
  // enqueued on `stream` and the host waits until it has completed.
  template <typename ScalarType>
  static void setToZero(ScalarType* ptr, size_t size, const GpuStream& stream) {
    checkRC(cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream));
    // cudaEventBlockingSync lets the host thread sleep instead of spin-wait.
    cudaEvent_t zero_event;
    checkRC(cudaEventCreateWithFlags(&zero_event, cudaEventBlockingSync));
    checkRC(cudaEventRecord(zero_event, stream));
    checkRC(cudaEventSynchronize(zero_event));
    // Fix: the event was previously leaked on every call; destroy it.
    checkRC(cudaEventDestroy(zero_event));
  }
};
#endif  // DCA_HAVE_GPU

+12 −0
Original line number Diff line number Diff line
@@ -108,6 +108,7 @@ public:
  void setAsync(const Container& rhs, const util::GpuStream& stream);

  void setToZeroAsync(const util::GpuStream& stream);
  void setToZero(const util::GpuStream& stream);

  template <class Container>
  void setAsync(const Container& rhs, int thred_id, int stream_id = 0);
@@ -324,6 +325,17 @@ void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuS
#endif
}

// Blocking zero-fill of the vector's storage, dispatched to the
// device-specific Memory<device_name>::setToZero overload that synchronizes
// on `stream` before returning.
// Fix: dropped the stale [[maybe_unused]] on `stream` — it is passed through
// unconditionally on both device paths.
template <typename ScalarType, DeviceType device_name, class Allocator>
void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream) {
  dca::linalg::util::Memory<device_name>::setToZero(data_, size_, stream);
}

// template <typename ScalarType, DeviceType device_name, class Allocator>
// void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
//   // TODO: implement in copy.hpp.
//   dca::linalg::util::memory<device_name>::setToZero(data_, size_, stream);
// }

template <typename ScalarType, DeviceType device_name, class Allocator>
template <class Container>
void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs, const int thread_id,
+4 −3
Original line number Diff line number Diff line
@@ -26,7 +26,7 @@ namespace util {
// Per-loop once-flag consumed by callOncePerLoop to ensure a task runs
// exactly once per loop iteration across worker threads.
struct OncePerLoopFlag {
  OncePerLoopFlag() : loop_done(-1) {}

  // Id of the last completed loop; -1 means no loop has run yet.
  // NOTE(review): the diff overlays a removed `std::atomic<int>` declaration
  // with this plain int — kept as int per the commit, which now guards the
  // whole of callOncePerLoop with `mutex`. Verify no code reads loop_done
  // without holding the lock before relying on this.
  int loop_done;
  dca::parallel::thread_traits::mutex_type mutex;
};

@@ -37,6 +37,8 @@ struct OncePerLoopFlag {
template <class F, class... Args>
void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... args)
{
    std::scoped_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex);

    const int currently_done = flag.loop_done;

    if (loop_id < 0)
@@ -47,10 +49,9 @@ void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&...
    else if (loop_id > currently_done + 1 && currently_done != -1)
        throw(std::logic_error("Loop id called out of order."));

    std::unique_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex);
    // Check if flag.loop_done changed before locking the mutex.
    if (loop_id <= flag.loop_done)
        return;
      throw std::runtime_error("I don't think this how lock objects work.");

    // Run the task.
    f(args...);
Loading