Loading include/dca/linalg/matrix.hpp +15 −0 Original line number Diff line number Diff line Loading @@ -209,6 +209,9 @@ public: template <DeviceType rhs_device_name> void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); template <DeviceType rhs_device_name> void set(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream); // Asynchronous assignment. template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream); Loading Loading @@ -429,6 +432,18 @@ void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_na stream_id); } template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream) { resize(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); cudaEvent_t set_event; checkRC(cudaEventCreateWithFlags(&set_event, cudaEventBlockingSync)); checkRC(cudaEventRecord(set_event, stream)); checkRC(cudaEventSynchronize(set_event)); } template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, Loading include/dca/linalg/util/gpu_stream.hpp +23 −3 Original line number Diff line number Diff line Loading @@ -30,10 +30,29 @@ class GpuStream { public: GpuStream() { checkRC(cudaStreamCreate(&stream_)); owning_ = true; } GpuStream(const GpuStream& other) = delete; GpuStream& operator=(const GpuStream& other) = delete; GpuStream(const cudaStream_t& stream) { stream_ = stream; owning_ = false; } GpuStream(const GpuStream& other) { stream_ = other.stream_; owning_ = false; } /** simple assignment does not take possesion of the cuda stream */ GpuStream& operator=(const GpuStream& other) { if (owning_ && stream_) checkRC(cudaStreamDestroy(stream_)); stream_ = other.stream_; owning_ = false; return *this; } GpuStream(GpuStream&& other) noexcept { swap(other); Loading @@ -58,7 +77,7 @@ public: } ~GpuStream() { if (stream_) if (owning_ && stream_) checkRC(cudaStreamDestroy(stream_)); } Loading @@ -72,6 +91,7 @@ public: private: cudaStream_t stream_ = nullptr; bool owning_ = false; }; #else // DCA_HAVE_GPU Loading include/dca/linalg/util/memory.hpp +14 −0 Original line number Diff line number Diff line Loading @@ -50,6 +50,11 @@ struct Memory<CPU> { static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& /*s*/) { setToZero(ptr, size); } template <typename ScalarType> static void setToZero(ScalarType* ptr, size_t size, const GpuStream& /*s*/) { setToZero(ptr, size); } }; #ifdef DCA_HAVE_GPU Loading Loading @@ -77,6 +82,15 @@ struct Memory<GPU> { static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& stream) { cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream); } template <typename ScalarType> static void setToZero(ScalarType* ptr, size_t size, const GpuStream& stream) { checkRC(cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream)); cudaEvent_t zero_event; checkRC(cudaEventCreateWithFlags(&zero_event, cudaEventBlockingSync)); checkRC(cudaEventRecord(zero_event, stream)); checkRC(cudaEventSynchronize(zero_event)); } }; #endif // DCA_HAVE_GPU Loading include/dca/linalg/vector.hpp +12 −0 Original line number Diff line number Diff line Loading @@ -108,6 +108,7 @@ public: void setAsync(const Container& rhs, const util::GpuStream& stream); void setToZeroAsync(const util::GpuStream& stream); void setToZero(const util::GpuStream& stream); template <class Container> void setAsync(const Container& rhs, int thred_id, int stream_id = 0); Loading Loading @@ -324,6 +325,17 @@ void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuS #endif } template <typename ScalarType, DeviceType device_name, class Allocator> void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) { dca::linalg::util::Memory<device_name>::setToZero(data_, size_, stream); } // template <typename ScalarType, DeviceType device_name, class Allocator> // void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) { // // TODO: implement in copy.hpp. // dca::linalg::util::memory<device_name>::setToZero(data_, size_, stream); // } template <typename ScalarType, DeviceType device_name, class Allocator> template <class Container> void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs, const int thread_id, Loading include/dca/parallel/util/call_once_per_loop.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -26,7 +26,7 @@ namespace util { struct OncePerLoopFlag { OncePerLoopFlag() : loop_done(-1) {} std::atomic<int> loop_done; int loop_done; dca::parallel::thread_traits::mutex_type mutex; }; Loading @@ -37,6 +37,8 @@ struct OncePerLoopFlag { template <class F, class... Args> void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... args) { std::scoped_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex); const int currently_done = flag.loop_done; if (loop_id < 0) Loading @@ -47,10 +49,9 @@ void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... else if (loop_id > currently_done + 1 && currently_done != -1) throw(std::logic_error("Loop id called out of order.")); std::unique_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex); // Check if flag.loop_done changed before locking the mutex. if (loop_id <= flag.loop_done) return; throw std::runtime_error("I don't think this how lock objects work."); // Run the task. f(args...); Loading Loading
include/dca/linalg/matrix.hpp +15 −0 Original line number Diff line number Diff line Loading @@ -209,6 +209,9 @@ public: template <DeviceType rhs_device_name> void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); template <DeviceType rhs_device_name> void set(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream); // Asynchronous assignment. template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream); Loading Loading @@ -429,6 +432,18 @@ void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_na stream_id); } template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream) { resize(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); cudaEvent_t set_event; checkRC(cudaEventCreateWithFlags(&set_event, cudaEventBlockingSync)); checkRC(cudaEventRecord(set_event, stream)); checkRC(cudaEventSynchronize(set_event)); } template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, Loading
include/dca/linalg/util/gpu_stream.hpp +23 −3 Original line number Diff line number Diff line Loading @@ -30,10 +30,29 @@ class GpuStream { public: GpuStream() { checkRC(cudaStreamCreate(&stream_)); owning_ = true; } GpuStream(const GpuStream& other) = delete; GpuStream& operator=(const GpuStream& other) = delete; GpuStream(const cudaStream_t& stream) { stream_ = stream; owning_ = false; } GpuStream(const GpuStream& other) { stream_ = other.stream_; owning_ = false; } /** simple assignment does not take possesion of the cuda stream */ GpuStream& operator=(const GpuStream& other) { if (owning_ && stream_) checkRC(cudaStreamDestroy(stream_)); stream_ = other.stream_; owning_ = false; return *this; } GpuStream(GpuStream&& other) noexcept { swap(other); Loading @@ -58,7 +77,7 @@ public: } ~GpuStream() { if (stream_) if (owning_ && stream_) checkRC(cudaStreamDestroy(stream_)); } Loading @@ -72,6 +91,7 @@ public: private: cudaStream_t stream_ = nullptr; bool owning_ = false; }; #else // DCA_HAVE_GPU Loading
include/dca/linalg/util/memory.hpp +14 −0 Original line number Diff line number Diff line Loading @@ -50,6 +50,11 @@ struct Memory<CPU> { static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& /*s*/) { setToZero(ptr, size); } template <typename ScalarType> static void setToZero(ScalarType* ptr, size_t size, const GpuStream& /*s*/) { setToZero(ptr, size); } }; #ifdef DCA_HAVE_GPU Loading Loading @@ -77,6 +82,15 @@ struct Memory<GPU> { static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& stream) { cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream); } template <typename ScalarType> static void setToZero(ScalarType* ptr, size_t size, const GpuStream& stream) { checkRC(cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream)); cudaEvent_t zero_event; checkRC(cudaEventCreateWithFlags(&zero_event, cudaEventBlockingSync)); checkRC(cudaEventRecord(zero_event, stream)); checkRC(cudaEventSynchronize(zero_event)); } }; #endif // DCA_HAVE_GPU Loading
include/dca/linalg/vector.hpp +12 −0 Original line number Diff line number Diff line Loading @@ -108,6 +108,7 @@ public: void setAsync(const Container& rhs, const util::GpuStream& stream); void setToZeroAsync(const util::GpuStream& stream); void setToZero(const util::GpuStream& stream); template <class Container> void setAsync(const Container& rhs, int thred_id, int stream_id = 0); Loading Loading @@ -324,6 +325,17 @@ void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuS #endif } template <typename ScalarType, DeviceType device_name, class Allocator> void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) { dca::linalg::util::Memory<device_name>::setToZero(data_, size_, stream); } // template <typename ScalarType, DeviceType device_name, class Allocator> // void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) { // // TODO: implement in copy.hpp. // dca::linalg::util::memory<device_name>::setToZero(data_, size_, stream); // } template <typename ScalarType, DeviceType device_name, class Allocator> template <class Container> void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs, const int thread_id, Loading
include/dca/parallel/util/call_once_per_loop.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -26,7 +26,7 @@ namespace util { struct OncePerLoopFlag { OncePerLoopFlag() : loop_done(-1) {} std::atomic<int> loop_done; int loop_done; dca::parallel::thread_traits::mutex_type mutex; }; Loading @@ -37,6 +37,8 @@ struct OncePerLoopFlag { template <class F, class... Args> void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... args) { std::scoped_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex); const int currently_done = flag.loop_done; if (loop_id < 0) Loading @@ -47,10 +49,9 @@ void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... else if (loop_id > currently_done + 1 && currently_done != -1) throw(std::logic_error("Loop id called out of order.")); std::unique_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex); // Check if flag.loop_done changed before locking the mutex. if (loop_id <= flag.loop_done) return; throw std::runtime_error("I don't think this how lock objects work."); // Run the task. f(args...); Loading