Loading include/dca/function/function.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -190,7 +190,7 @@ public: // Enable only if all arguments are integral to prevent subind_to_linind(int*, int) to resolve to // subind_to_linind(int...) rather than subind_to_linind(const int* const, int). template <typename... Ts> std::enable_if_t<util::if_all<std::is_integral<Ts>::value...>::value, int> subind_2_linind( std::enable_if_t<util::ifAll(std::is_integral_v<Ts>...), int> subind_2_linind( const Ts... subindices) const { // We need to cast all subindices to the same type for dmn_variadic. return dmn(static_cast<int>(subindices)...); Loading include/dca/linalg/matrix.hpp +18 −24 Original line number Diff line number Diff line Loading @@ -194,22 +194,20 @@ public: // Swaps the contents of the matrix, included the name, with those of rhs. void swapWithName(Matrix<ScalarType, device_name>& rhs); #ifdef DCA_HAVE_CUDA // Asynchronous assignment. // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) // + synchronization of stream template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, cudaStream_t stream); void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) // Asynchronous assignment. template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::CudaStream& stream); void setToZero(cudaStream_t stream); #else // Synchronous assignment fallback for SetAsync. // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); #endif // DCA_HAVE_CUDA void setToZero(const util::CudaStream& stream); // Prints the values of the matrix elements. void print() const; Loading Loading @@ -413,12 +411,19 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam swap(rhs); } #ifdef DCA_HAVE_CUDA template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id) { resize(rhs.size_); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id, stream_id); } template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const cudaStream_t stream) { const util::CudaStream& stream) { resizeNoCopy(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); } Loading @@ -431,21 +436,10 @@ void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_devi } template <typename ScalarType, DeviceType device_name> void Matrix<ScalarType, device_name>::setToZero(cudaStream_t stream) { cudaMemsetAsync(data_, 0, leadingDimension() * nrCols() * sizeof(ScalarType), stream); } #else template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int /*thread_id*/, int /*stream_id*/) { set(rhs); void Matrix<ScalarType, device_name>::setToZero(const util::CudaStream& stream) { util::Memory<device_name>::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); } #endif // DCA_HAVE_CUDA template <typename ScalarType, DeviceType device_name> void Matrix<ScalarType, device_name>::print() const { if (device_name == GPU) Loading include/dca/linalg/multi_vector.hpp 0 → 100644 +104 −0 Original line number Diff line number Diff line // Copyright (C) 2018 ETH Zurich // Copyright (C) 2018 UT-Battelle, LLC // All rights reserved. // // See LICENSE for terms of usage. // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // // This class represents an AoS where each array has the same length but arbitrary type. #ifndef DCA_LINALG_MULTI_VECTOR_HPP #define DCA_LINALG_MULTI_VECTOR_HPP #include "dca/linalg/vector.hpp" #include "dca/linalg/util/cuda_stream.hpp" #include "dca/util/type_list.hpp" #include "dca/util/pack_operations.hpp" namespace dca { namespace linalg { // dca::linalg:: template <DeviceType device, typename... Ts> class MultiVector { public: using Types = dca::util::Typelist<Ts...>; template <unsigned id> using Type = typename dca::util::TypeAt<id, Types>::type; // Initialize each sub-array with size n. MultiVector(std::size_t n = 0); // Resize the container so that each sub-array has size n, invalidating references and values. void resizeNoCopy(std::size_t n); // Copy the values of rhs asynchronously. template <DeviceType other_device> void setAsync(const MultiVector<other_device, Ts...>& rhs, const linalg::util::CudaStream& stream) { size_ = rhs.size_; data_.setAsync(rhs.data_, stream); } // Returns a pointer to the beginning of the id-th array // Preconditions: 0 <= id < length(Ts...). template <unsigned id> auto get() -> Type<id>*; template <unsigned id> auto get() const -> const Type<id>*; std::size_t size() const { return size_; } // Allows setAsync to access the data on another device. template <DeviceType other_device, typename... T2s> friend class MultiVector; private: template <unsigned id> std::size_t offset() const; Vector<unsigned char, device> data_; std::size_t size_; }; template <DeviceType device, typename... Ts> MultiVector<device, Ts...>::MultiVector(std::size_t n) { resizeNoCopy(n); } template <DeviceType device, typename... Ts> void MultiVector<device, Ts...>::resizeNoCopy(std::size_t n) { data_.resizeNoCopy(n * dca::util::size_sum<Ts...>); size_ = n; } template <DeviceType device, typename... Ts> template <unsigned id> auto MultiVector<device, Ts...>::get() -> Type<id>* { unsigned char* ptr = data_.ptr() + offset<id>(); return reinterpret_cast<Type<id>*>(ptr); } template <DeviceType device, typename... Ts> template <unsigned id> auto MultiVector<device, Ts...>::get() const -> const Type<id>* { const unsigned char* ptr = data_.ptr() + offset<id>(); return reinterpret_cast<const Type<id>*>(ptr); } template <DeviceType device, typename... Ts> template <unsigned id> std::size_t MultiVector<device, Ts...>::offset() const { static_assert(id < sizeof...(Ts), "Invalid sub-array id."); constexpr unsigned size_t_sum = dca::util::size_sum<dca::util::Sublist<id, Ts...>>; return size_ * size_t_sum; } } // namespace linalg } // namespace dca #endif // DCA_LINALG_MULTI_VECTOR_HPP include/dca/linalg/util/copy.hpp +16 −3 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <complex> #include <cstring> #include "dca/linalg/device_type.hpp" #include "cuda_stream.hpp" #ifdef DCA_HAVE_CUDA #include <cuda_runtime.h> Loading Loading @@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src memoryCopyCpu(dest, ld_dest, src, ld_src, size); } // Synchronous 1D memory copy fallback. template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, src, size); } template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, ld_dest, src, ld_src, size); } #endif // DCA_HAVE_CUDA } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_LINALG_UTIL_COPY_HPP include/dca/linalg/util/magma_batched_gemm.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -100,9 +100,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb, const ScalarType beta, const int lda, const int ldb, const int ldc) { // TODO: store in a buffer if the performance gain is necessary. a_ptr_dev_.setAsync(a_ptr_, queue_); b_ptr_dev_.setAsync(b_ptr_, queue_); c_ptr_dev_.setAsync(c_ptr_, queue_); a_ptr_dev_.setAsync(a_ptr_, queue_.getStream()); b_ptr_dev_.setAsync(b_ptr_, queue_.getStream()); c_ptr_dev_.setAsync(c_ptr_, queue_.getStream()); copied_.record(queue_); const int n_batched = a_ptr_.size(); Loading Loading
include/dca/function/function.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -190,7 +190,7 @@ public: // Enable only if all arguments are integral to prevent subind_to_linind(int*, int) to resolve to // subind_to_linind(int...) rather than subind_to_linind(const int* const, int). template <typename... Ts> std::enable_if_t<util::if_all<std::is_integral<Ts>::value...>::value, int> subind_2_linind( std::enable_if_t<util::ifAll(std::is_integral_v<Ts>...), int> subind_2_linind( const Ts... subindices) const { // We need to cast all subindices to the same type for dmn_variadic. return dmn(static_cast<int>(subindices)...); Loading
include/dca/linalg/matrix.hpp +18 −24 Original line number Diff line number Diff line Loading @@ -194,22 +194,20 @@ public: // Swaps the contents of the matrix, included the name, with those of rhs. void swapWithName(Matrix<ScalarType, device_name>& rhs); #ifdef DCA_HAVE_CUDA // Asynchronous assignment. // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) // + synchronization of stream template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, cudaStream_t stream); void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) // Asynchronous assignment. template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::CudaStream& stream); void setToZero(cudaStream_t stream); #else // Synchronous assignment fallback for SetAsync. // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) template <DeviceType rhs_device_name> void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id); #endif // DCA_HAVE_CUDA void setToZero(const util::CudaStream& stream); // Prints the values of the matrix elements. void print() const; Loading Loading @@ -413,12 +411,19 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam swap(rhs); } #ifdef DCA_HAVE_CUDA template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id) { resize(rhs.size_); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id, stream_id); } template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const cudaStream_t stream) { const util::CudaStream& stream) { resizeNoCopy(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); } Loading @@ -431,21 +436,10 @@ void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_devi } template <typename ScalarType, DeviceType device_name> void Matrix<ScalarType, device_name>::setToZero(cudaStream_t stream) { cudaMemsetAsync(data_, 0, leadingDimension() * nrCols() * sizeof(ScalarType), stream); } #else template <typename ScalarType, DeviceType device_name> template <DeviceType rhs_device_name> void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int /*thread_id*/, int /*stream_id*/) { set(rhs); void Matrix<ScalarType, device_name>::setToZero(const util::CudaStream& stream) { util::Memory<device_name>::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); } #endif // DCA_HAVE_CUDA template <typename ScalarType, DeviceType device_name> void Matrix<ScalarType, device_name>::print() const { if (device_name == GPU) Loading
include/dca/linalg/multi_vector.hpp 0 → 100644 +104 −0 Original line number Diff line number Diff line // Copyright (C) 2018 ETH Zurich // Copyright (C) 2018 UT-Battelle, LLC // All rights reserved. // // See LICENSE for terms of usage. // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // // This class represents an AoS where each array has the same length but arbitrary type. #ifndef DCA_LINALG_MULTI_VECTOR_HPP #define DCA_LINALG_MULTI_VECTOR_HPP #include "dca/linalg/vector.hpp" #include "dca/linalg/util/cuda_stream.hpp" #include "dca/util/type_list.hpp" #include "dca/util/pack_operations.hpp" namespace dca { namespace linalg { // dca::linalg:: template <DeviceType device, typename... Ts> class MultiVector { public: using Types = dca::util::Typelist<Ts...>; template <unsigned id> using Type = typename dca::util::TypeAt<id, Types>::type; // Initialize each sub-array with size n. MultiVector(std::size_t n = 0); // Resize the container so that each sub-array has size n, invalidating references and values. void resizeNoCopy(std::size_t n); // Copy the values of rhs asynchronously. template <DeviceType other_device> void setAsync(const MultiVector<other_device, Ts...>& rhs, const linalg::util::CudaStream& stream) { size_ = rhs.size_; data_.setAsync(rhs.data_, stream); } // Returns a pointer to the beginning of the id-th array // Preconditions: 0 <= id < length(Ts...). template <unsigned id> auto get() -> Type<id>*; template <unsigned id> auto get() const -> const Type<id>*; std::size_t size() const { return size_; } // Allows setAsync to access the data on another device. template <DeviceType other_device, typename... T2s> friend class MultiVector; private: template <unsigned id> std::size_t offset() const; Vector<unsigned char, device> data_; std::size_t size_; }; template <DeviceType device, typename... Ts> MultiVector<device, Ts...>::MultiVector(std::size_t n) { resizeNoCopy(n); } template <DeviceType device, typename... Ts> void MultiVector<device, Ts...>::resizeNoCopy(std::size_t n) { data_.resizeNoCopy(n * dca::util::size_sum<Ts...>); size_ = n; } template <DeviceType device, typename... Ts> template <unsigned id> auto MultiVector<device, Ts...>::get() -> Type<id>* { unsigned char* ptr = data_.ptr() + offset<id>(); return reinterpret_cast<Type<id>*>(ptr); } template <DeviceType device, typename... Ts> template <unsigned id> auto MultiVector<device, Ts...>::get() const -> const Type<id>* { const unsigned char* ptr = data_.ptr() + offset<id>(); return reinterpret_cast<const Type<id>*>(ptr); } template <DeviceType device, typename... Ts> template <unsigned id> std::size_t MultiVector<device, Ts...>::offset() const { static_assert(id < sizeof...(Ts), "Invalid sub-array id."); constexpr unsigned size_t_sum = dca::util::size_sum<dca::util::Sublist<id, Ts...>>; return size_ * size_t_sum; } } // namespace linalg } // namespace dca #endif // DCA_LINALG_MULTI_VECTOR_HPP
include/dca/linalg/util/copy.hpp +16 −3 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <complex> #include <cstring> #include "dca/linalg/device_type.hpp" #include "cuda_stream.hpp" #ifdef DCA_HAVE_CUDA #include <cuda_runtime.h> Loading Loading @@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src memoryCopyCpu(dest, ld_dest, src, ld_src, size); } // Synchronous 1D memory copy fallback. template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, src, size); } template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size, const util::CudaStream& /*s*/) { memoryCopyCpu(dest, ld_dest, src, ld_src, size); } #endif // DCA_HAVE_CUDA } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_LINALG_UTIL_COPY_HPP
include/dca/linalg/util/magma_batched_gemm.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -100,9 +100,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb, const ScalarType beta, const int lda, const int ldb, const int ldc) { // TODO: store in a buffer if the performance gain is necessary. a_ptr_dev_.setAsync(a_ptr_, queue_); b_ptr_dev_.setAsync(b_ptr_, queue_); c_ptr_dev_.setAsync(c_ptr_, queue_); a_ptr_dev_.setAsync(a_ptr_, queue_.getStream()); b_ptr_dev_.setAsync(b_ptr_, queue_.getStream()); c_ptr_dev_.setAsync(c_ptr_, queue_.getStream()); copied_.record(queue_); const int n_batched = a_ptr_.size(); Loading