Loading include/dca/linalg/util/copy.hpp +16 −3 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <complex> #include <cstring> #include "dca/linalg/device_type.hpp" #include "cuda_stream.hpp" #ifdef DCA_HAVE_CUDA #include <cuda_runtime.h> Loading Loading @@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src memoryCopyCpu(dest, ld_dest, src, ld_src, size); } // Synchronous 1D memory copy fallback. template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, src, size); } template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, ld_dest, src, ld_src, size); } #endif // DCA_HAVE_CUDA } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_LINALG_UTIL_COPY_HPP include/dca/linalg/util/cuda_stream.hpp +5 −0 Original line number Diff line number Diff line Loading @@ -31,10 +31,15 @@ public: } CudaStream(const CudaStream& other) = delete; CudaStream& operator=(const CudaStream& other) = delete; CudaStream(CudaStream&& other) { std::swap(stream_, other.stream_); } CudaStream& operator=(CudaStream&& other) { std::swap(stream_, other.stream_); return *this; } void sync() const { checkRC(cudaStreamSynchronize(stream_)); Loading include/dca/linalg/util/magma_queue.hpp +16 −1 Original line number Diff line number Diff line Loading @@ -36,8 +36,17 @@ public: } MagmaQueue(const MagmaQueue& rhs) = delete; MagmaQueue& operator=(const MagmaQueue& rhs) = delete; MagmaQueue(MagmaQueue&& rhs) = default; MagmaQueue(MagmaQueue&& rhs) : CudaStream(std::move(rhs)) { swapMembers(rhs); } MagmaQueue& operator=(MagmaQueue&& rhs) { CudaStream::operator=(std::move(rhs)); swapMembers(rhs); return *this; } ~MagmaQueue() { magma_queue_destroy(queue_); Loading @@ -50,6 +59,12 @@ public: } private: void swapMembers(MagmaQueue& rhs) { std::swap(cublas_handle_, rhs.cublas_handle_); std::swap(cusparse_handle_, rhs.cusparse_handle_); std::swap(queue_, rhs.queue_); } magma_queue_t queue_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; cusparseHandle_t cusparse_handle_ = nullptr; Loading include/dca/linalg/util/magma_vbatched_gemm.hpp +15 −14 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #include "dca/linalg/lapack/magma.hpp" #include "dca/linalg/util/allocators/vectors_typedefs.hpp" #include "dca/linalg/util/cuda_event.hpp" #include "dca/linalg/util/magma_queue.hpp" #include "dca/linalg/vector.hpp" namespace dca { Loading @@ -30,10 +31,10 @@ template <typename ScalarType> class MagmaVBatchedGemm { public: // Creates a plan for a batched gemm with variable size. MagmaVBatchedGemm(magma_queue_t queue); MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue); // Creates a plan and allocates the memory for the arguments of `size` // multiplications. MagmaVBatchedGemm(int size, magma_queue_t queue); MagmaVBatchedGemm(int size, const linalg::util::MagmaQueue& queue); // Allocates the memory for the arguments of `size` multiplications. void reserve(int size); Loading @@ -52,8 +53,8 @@ public: void synchronizeCopy(); private: magma_queue_t queue_; const cudaStream_t stream_; const linalg::util::MagmaQueue& queue_; const linalg::util::CudaStream& stream_; CudaEvent copied_; linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_; Loading @@ -67,11 +68,11 @@ private: }; template <typename ScalarType> MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(magma_queue_t queue) : queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)), m_max_(0), n_max_(0), k_max_(0) {} MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue) : queue_(queue), stream_(queue), m_max_(0), n_max_(0), k_max_(0) {} template <typename ScalarType> MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, magma_queue_t queue) MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, const linalg::util::MagmaQueue& queue) : MagmaVBatchedGemm(queue) { reserve(size); } Loading Loading @@ -126,10 +127,10 @@ void MagmaVBatchedGemm<ScalarType>::execute(const char transa, const char transb copied_.record(stream_); const int n_batched = a_ptr_.size(); magma::magmablas_gemm_vbatched_max_nocheck(transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(), ScalarType(1), a_ptr_dev_.ptr(), lda_dev_.ptr(), b_ptr_dev_.ptr(), ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(), ldc_dev_.ptr(), n_batched, m_max_, n_max_, k_max_, queue_); magma::magmablas_gemm_vbatched_max_nocheck( transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(), ScalarType(1), a_ptr_dev_.ptr(), lda_dev_.ptr(), b_ptr_dev_.ptr(), ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(), ldc_dev_.ptr(), n_batched, m_max_, n_max_, k_max_, queue_); assert(cudaPeekAtLastError() == cudaSuccess); } Loading @@ -151,9 +152,9 @@ void MagmaVBatchedGemm<ScalarType>::synchronizeCopy() { m_max_ = n_max_ = k_max_ = 0; } } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_HAVE_CUDA #endif // DCA_LINALG_UTIL_MAGMA_VBATCHED_GEMM_HPP include/dca/linalg/util/memory.hpp +9 −4 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #include <stdexcept> #include "dca/linalg/device_type.hpp" #include "dca/linalg/util/cuda_stream.hpp" #include "dca/util/ignore.hpp" #ifdef DCA_HAVE_CUDA Loading Loading @@ -48,11 +49,10 @@ struct Memory<CPU> { std::complex<ScalarType>* ptr, size_t size) { std::memset(static_cast<void*>(ptr), 0, size * sizeof(std::complex<ScalarType>)); } // Do nothing for non arithmetic types. template <typename ScalarType> static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero( ScalarType /*ptr*/, size_t /*size*/) {} static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& /*s*/) { setToZero(ptr, size); } }; #ifdef DCA_HAVE_CUDA Loading @@ -75,6 +75,11 @@ struct Memory<GPU> { template <typename ScalarType> static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero( ScalarType /*ptr*/, size_t /*size*/) {} template <typename ScalarType> static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& stream) { cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream); } }; #endif // DCA_HAVE_CUDA Loading Loading
include/dca/linalg/util/copy.hpp +16 −3 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <complex> #include <cstring> #include "dca/linalg/device_type.hpp" #include "cuda_stream.hpp" #ifdef DCA_HAVE_CUDA #include <cuda_runtime.h> Loading Loading @@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src memoryCopyCpu(dest, ld_dest, src, ld_src, size); } // Synchronous 1D memory copy fallback. template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, src, size); } template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, ld_dest, src, ld_src, size); } #endif // DCA_HAVE_CUDA } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_LINALG_UTIL_COPY_HPP
include/dca/linalg/util/cuda_stream.hpp +5 −0 Original line number Diff line number Diff line Loading @@ -31,10 +31,15 @@ public: } CudaStream(const CudaStream& other) = delete; CudaStream& operator=(const CudaStream& other) = delete; CudaStream(CudaStream&& other) { std::swap(stream_, other.stream_); } CudaStream& operator=(CudaStream&& other) { std::swap(stream_, other.stream_); return *this; } void sync() const { checkRC(cudaStreamSynchronize(stream_)); Loading
include/dca/linalg/util/magma_queue.hpp +16 −1 Original line number Diff line number Diff line Loading @@ -36,8 +36,17 @@ public: } MagmaQueue(const MagmaQueue& rhs) = delete; MagmaQueue& operator=(const MagmaQueue& rhs) = delete; MagmaQueue(MagmaQueue&& rhs) = default; MagmaQueue(MagmaQueue&& rhs) : CudaStream(std::move(rhs)) { swapMembers(rhs); } MagmaQueue& operator=(MagmaQueue&& rhs) { CudaStream::operator=(std::move(rhs)); swapMembers(rhs); return *this; } ~MagmaQueue() { magma_queue_destroy(queue_); Loading @@ -50,6 +59,12 @@ public: } private: void swapMembers(MagmaQueue& rhs) { std::swap(cublas_handle_, rhs.cublas_handle_); std::swap(cusparse_handle_, rhs.cusparse_handle_); std::swap(queue_, rhs.queue_); } magma_queue_t queue_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; cusparseHandle_t cusparse_handle_ = nullptr; Loading
include/dca/linalg/util/magma_vbatched_gemm.hpp +15 −14 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #include "dca/linalg/lapack/magma.hpp" #include "dca/linalg/util/allocators/vectors_typedefs.hpp" #include "dca/linalg/util/cuda_event.hpp" #include "dca/linalg/util/magma_queue.hpp" #include "dca/linalg/vector.hpp" namespace dca { Loading @@ -30,10 +31,10 @@ template <typename ScalarType> class MagmaVBatchedGemm { public: // Creates a plan for a batched gemm with variable size. MagmaVBatchedGemm(magma_queue_t queue); MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue); // Creates a plan and allocates the memory for the arguments of `size` // multiplications. MagmaVBatchedGemm(int size, magma_queue_t queue); MagmaVBatchedGemm(int size, const linalg::util::MagmaQueue& queue); // Allocates the memory for the arguments of `size` multiplications. void reserve(int size); Loading @@ -52,8 +53,8 @@ public: void synchronizeCopy(); private: magma_queue_t queue_; const cudaStream_t stream_; const linalg::util::MagmaQueue& queue_; const linalg::util::CudaStream& stream_; CudaEvent copied_; linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_; Loading @@ -67,11 +68,11 @@ private: }; template <typename ScalarType> MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(magma_queue_t queue) : queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)), m_max_(0), n_max_(0), k_max_(0) {} MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue) : queue_(queue), stream_(queue), m_max_(0), n_max_(0), k_max_(0) {} template <typename ScalarType> MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, magma_queue_t queue) MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, const linalg::util::MagmaQueue& queue) : MagmaVBatchedGemm(queue) { reserve(size); } Loading Loading @@ -126,10 +127,10 @@ void MagmaVBatchedGemm<ScalarType>::execute(const char transa, const char transb copied_.record(stream_); const int n_batched = a_ptr_.size(); magma::magmablas_gemm_vbatched_max_nocheck(transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(), ScalarType(1), a_ptr_dev_.ptr(), lda_dev_.ptr(), b_ptr_dev_.ptr(), ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(), ldc_dev_.ptr(), n_batched, m_max_, n_max_, k_max_, queue_); magma::magmablas_gemm_vbatched_max_nocheck( transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(), ScalarType(1), a_ptr_dev_.ptr(), lda_dev_.ptr(), b_ptr_dev_.ptr(), ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(), ldc_dev_.ptr(), n_batched, m_max_, n_max_, k_max_, queue_); assert(cudaPeekAtLastError() == cudaSuccess); } Loading @@ -151,9 +152,9 @@ void MagmaVBatchedGemm<ScalarType>::synchronizeCopy() { m_max_ = n_max_ = k_max_ = 0; } } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_HAVE_CUDA #endif // DCA_LINALG_UTIL_MAGMA_VBATCHED_GEMM_HPP
include/dca/linalg/util/memory.hpp +9 −4 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #include <stdexcept> #include "dca/linalg/device_type.hpp" #include "dca/linalg/util/cuda_stream.hpp" #include "dca/util/ignore.hpp" #ifdef DCA_HAVE_CUDA Loading Loading @@ -48,11 +49,10 @@ struct Memory<CPU> { std::complex<ScalarType>* ptr, size_t size) { std::memset(static_cast<void*>(ptr), 0, size * sizeof(std::complex<ScalarType>)); } // Do nothing for non arithmetic types. template <typename ScalarType> static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero( ScalarType /*ptr*/, size_t /*size*/) {} static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& /*s*/) { setToZero(ptr, size); } }; #ifdef DCA_HAVE_CUDA Loading @@ -75,6 +75,11 @@ struct Memory<GPU> { template <typename ScalarType> static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero( ScalarType /*ptr*/, size_t /*size*/) {} template <typename ScalarType> static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& stream) { cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream); } }; #endif // DCA_HAVE_CUDA Loading