partial fix for race condition with init of tp_accumulator_gpu (29fa096f) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/linalg/matrix.hpp

+15 −0

Original line number	Diff line number	Diff line
		@@ -209,6 +209,9 @@ public:
		template <DeviceType rhs_device_name>
		void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

		template <DeviceType rhs_device_name>
		void set(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream);

		// Asynchronous assignment.
		template <DeviceType rhs_device_name>
		void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::GpuStream& stream);
		@@ -429,6 +432,18 @@ void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_na
		stream_id);
		}

		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
		const util::GpuStream& stream) {
		resize(rhs.size_);
		util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream);
		cudaEvent_t set_event;
		checkRC(cudaEventCreateWithFlags(&set_event, cudaEventBlockingSync));
		checkRC(cudaEventRecord(set_event, stream));
		checkRC(cudaEventSynchronize(set_event));
		}

		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs,

+23 −3

Original line number	Diff line number	Diff line
		@@ -30,10 +30,29 @@ class GpuStream {
		public:
		GpuStream() {
		checkRC(cudaStreamCreate(&stream_));
		owning_ = true;
		}

		GpuStream(const GpuStream& other) = delete;
		GpuStream& operator=(const GpuStream& other) = delete;
		GpuStream(const cudaStream_t& stream) {
		stream_ = stream;
		owning_ = false;
		}

		GpuStream(const GpuStream& other) {
		stream_ = other.stream_;
		owning_ = false;
		}

		/** simple assignment does not take possesion of the cuda stream
		*/
		GpuStream& operator=(const GpuStream& other)
		{
		if (owning_ && stream_)
		checkRC(cudaStreamDestroy(stream_));
		stream_ = other.stream_;
		owning_ = false;
		return *this;
		}

		GpuStream(GpuStream&& other) noexcept {
		swap(other);
		@@ -58,7 +77,7 @@ public:
		}

		~GpuStream() {
		if (stream_)
		if (owning_ && stream_)
		checkRC(cudaStreamDestroy(stream_));
		}

		@@ -72,6 +91,7 @@ public:

		private:
		cudaStream_t stream_ = nullptr;
		bool owning_ = false;
		};

		#else // DCA_HAVE_GPU

+14 −0

Original line number	Diff line number	Diff line
		@@ -50,6 +50,11 @@ struct Memory<CPU> {
		static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& /s/) {
		setToZero(ptr, size);
		}

		template <typename ScalarType>
		static void setToZero(ScalarType* ptr, size_t size, const GpuStream& /s/) {
		setToZero(ptr, size);
		}
		};

		#ifdef DCA_HAVE_GPU
		@@ -77,6 +82,15 @@ struct Memory<GPU> {
		static void setToZeroAsync(ScalarType* ptr, size_t size, const GpuStream& stream) {
		cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream);
		}

		template <typename ScalarType>
		static void setToZero(ScalarType* ptr, size_t size, const GpuStream& stream) {
		checkRC(cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream));
		cudaEvent_t zero_event;
		checkRC(cudaEventCreateWithFlags(&zero_event, cudaEventBlockingSync));
		checkRC(cudaEventRecord(zero_event, stream));
		checkRC(cudaEventSynchronize(zero_event));
		}
		};
		#endif // DCA_HAVE_GPU

+12 −0

Original line number	Diff line number	Diff line
		@@ -108,6 +108,7 @@ public:
		void setAsync(const Container& rhs, const util::GpuStream& stream);

		void setToZeroAsync(const util::GpuStream& stream);
		void setToZero(const util::GpuStream& stream);

		template <class Container>
		void setAsync(const Container& rhs, int thred_id, int stream_id = 0);
		@@ -324,6 +325,17 @@ void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuS
		#endif
		}

		template <typename ScalarType, DeviceType device_name, class Allocator>
		void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
		dca::linalg::util::Memory<device_name>::setToZero(data_, size_, stream);
		}

		// template <typename ScalarType, DeviceType device_name, class Allocator>
		// void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
		// // TODO: implement in copy.hpp.
		// dca::linalg::util::memory<device_name>::setToZero(data_, size_, stream);
		// }

		template <typename ScalarType, DeviceType device_name, class Allocator>
		template <class Container>
		void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs, const int thread_id,

+4 −3

Original line number	Diff line number	Diff line
		@@ -26,7 +26,7 @@ namespace util {
		struct OncePerLoopFlag {
		OncePerLoopFlag() : loop_done(-1) {}

		std::atomic<int> loop_done;
		int loop_done;
		dca::parallel::thread_traits::mutex_type mutex;
		};

		@@ -37,6 +37,8 @@ struct OncePerLoopFlag {
		template <class F, class... Args>
		void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... args)
		{
		std::scoped_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex);

		const int currently_done = flag.loop_done;

		if (loop_id < 0)
		@@ -47,10 +49,9 @@ void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&...
		else if (loop_id > currently_done + 1 && currently_done != -1)
		throw(std::logic_error("Loop id called out of order."));

		std::unique_lock<dca::parallel::thread_traits::mutex_type> lock(flag.mutex);
		// Check if flag.loop_done changed before locking the mutex.
		if (loop_id <= flag.loop_done)
		return;
		throw std::runtime_error("I don't think this how lock objects work.");

		// Run the task.
		f(args...);