Merge branch 'generic_container' into autocorrelation (be9df424) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/linalg/util/copy.hpp

+16 −3

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@
		#include <complex>
		#include <cstring>
		#include "dca/linalg/device_type.hpp"
		#include "cuda_stream.hpp"

		#ifdef DCA_HAVE_CUDA
		#include <cuda_runtime.h>
		@@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src
		memoryCopyCpu(dest, ld_dest, src, ld_src, size);
		}

		// Synchronous 1D memory copy fallback.
		template <typename ScalarType>
		void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size,
		const util::CudaStream& /s/) {
		memoryCopyCpu(dest, src, size);
		}
		template <typename ScalarType>
		void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src,
		std::pair<int, int> size, const util::CudaStream& /s/) {
		memoryCopyCpu(dest, ld_dest, src, ld_src, size);
		}

		#endif // DCA_HAVE_CUDA

		} // util
		} // linalg
		} // dca
		} // namespace util
		} // namespace linalg
		} // namespace dca

		#endif // DCA_LINALG_UTIL_COPY_HPP

include/dca/linalg/util/cuda_stream.hpp

+5 −0

Original line number	Diff line number	Diff line
		@@ -31,10 +31,15 @@ public:
		}

		CudaStream(const CudaStream& other) = delete;
		CudaStream& operator=(const CudaStream& other) = delete;

		CudaStream(CudaStream&& other) {
		std::swap(stream_, other.stream_);
		}
		CudaStream& operator=(CudaStream&& other) {
		std::swap(stream_, other.stream_);
		return *this;
		}

		void sync() const {
		checkRC(cudaStreamSynchronize(stream_));

include/dca/linalg/util/magma_queue.hpp

+16 −1

Original line number	Diff line number	Diff line
		@@ -36,8 +36,17 @@ public:
		}

		MagmaQueue(const MagmaQueue& rhs) = delete;
		MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

		MagmaQueue(MagmaQueue&& rhs) = default;
		MagmaQueue(MagmaQueue&& rhs) : CudaStream(std::move(rhs)) {
		swapMembers(rhs);
		}

		MagmaQueue& operator=(MagmaQueue&& rhs) {
		CudaStream::operator=(std::move(rhs));
		swapMembers(rhs);
		return *this;
		}

		~MagmaQueue() {
		magma_queue_destroy(queue_);
		@@ -50,6 +59,12 @@ public:
		}

		private:
		void swapMembers(MagmaQueue& rhs) {
		std::swap(cublas_handle_, rhs.cublas_handle_);
		std::swap(cusparse_handle_, rhs.cusparse_handle_);
		std::swap(queue_, rhs.queue_);
		}

		magma_queue_t queue_ = nullptr;
		cublasHandle_t cublas_handle_ = nullptr;
		cusparseHandle_t cusparse_handle_ = nullptr;

include/dca/linalg/util/magma_vbatched_gemm.hpp

+15 −14

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@
		#include "dca/linalg/lapack/magma.hpp"
		#include "dca/linalg/util/allocators/vectors_typedefs.hpp"
		#include "dca/linalg/util/cuda_event.hpp"
		#include "dca/linalg/util/magma_queue.hpp"
		#include "dca/linalg/vector.hpp"

		namespace dca {
		@@ -30,10 +31,10 @@ template <typename ScalarType>
		class MagmaVBatchedGemm {
		public:
		// Creates a plan for a batched gemm with variable size.
		MagmaVBatchedGemm(magma_queue_t queue);
		MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue);
		// Creates a plan and allocates the memory for the arguments of `size`
		// multiplications.
		MagmaVBatchedGemm(int size, magma_queue_t queue);
		MagmaVBatchedGemm(int size, const linalg::util::MagmaQueue& queue);

		// Allocates the memory for the arguments of `size` multiplications.
		void reserve(int size);
		@@ -52,8 +53,8 @@ public:
		void synchronizeCopy();

		private:
		magma_queue_t queue_;
		const cudaStream_t stream_;
		const linalg::util::MagmaQueue& queue_;
		const linalg::util::CudaStream& stream_;
		CudaEvent copied_;

		linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_;
		@@ -67,11 +68,11 @@ private:
		};

		template <typename ScalarType>
		MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(magma_queue_t queue)
		: queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)), m_max_(0), n_max_(0), k_max_(0) {}
		MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue)
		: queue_(queue), stream_(queue), m_max_(0), n_max_(0), k_max_(0) {}

		template <typename ScalarType>
		MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, magma_queue_t queue)
		MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, const linalg::util::MagmaQueue& queue)
		: MagmaVBatchedGemm(queue) {
		reserve(size);
		}
		@@ -126,10 +127,10 @@ void MagmaVBatchedGemm<ScalarType>::execute(const char transa, const char transb
		copied_.record(stream_);

		const int n_batched = a_ptr_.size();
		magma::magmablas_gemm_vbatched_max_nocheck(transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(),
		ScalarType(1), a_ptr_dev_.ptr(), lda_dev_.ptr(), b_ptr_dev_.ptr(),
		ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(), ldc_dev_.ptr(),
		n_batched, m_max_, n_max_, k_max_, queue_);
		magma::magmablas_gemm_vbatched_max_nocheck(
		transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(), ScalarType(1), a_ptr_dev_.ptr(),
		lda_dev_.ptr(), b_ptr_dev_.ptr(), ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(),
		ldc_dev_.ptr(), n_batched, m_max_, n_max_, k_max_, queue_);

		assert(cudaPeekAtLastError() == cudaSuccess);
		}
		@@ -151,9 +152,9 @@ void MagmaVBatchedGemm<ScalarType>::synchronizeCopy() {
		m_max_ = n_max_ = k_max_ = 0;
		}

		} // util
		} // linalg
		} // dca
		} // namespace util
		} // namespace linalg
		} // namespace dca

		#endif // DCA_HAVE_CUDA
		#endif // DCA_LINALG_UTIL_MAGMA_VBATCHED_GEMM_HPP

include/dca/linalg/util/memory.hpp

+9 −4

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@
		#include <stdexcept>

		#include "dca/linalg/device_type.hpp"
		#include "dca/linalg/util/cuda_stream.hpp"
		#include "dca/util/ignore.hpp"

		#ifdef DCA_HAVE_CUDA
		@@ -48,11 +49,10 @@ struct Memory<CPU> {
		std::complex<ScalarType>* ptr, size_t size) {
		std::memset(static_cast<void>(ptr), 0, size sizeof(std::complex<ScalarType>));
		}

		// Do nothing for non arithmetic types.
		template <typename ScalarType>
		static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero(
		ScalarType /ptr/, size_t /size/) {}
		static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& /s/) {
		setToZero(ptr, size);
		}
		};

		#ifdef DCA_HAVE_CUDA
		@@ -75,6 +75,11 @@ struct Memory<GPU> {
		template <typename ScalarType>
		static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero(
		ScalarType /ptr/, size_t /size/) {}

		template <typename ScalarType>
		static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& stream) {
		cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream);
		}
		};
		#endif // DCA_HAVE_CUDA