Merge pull request #169 from gbalduzz/generic_container (7b6f95c4) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/function/function.hpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -190,7 +190,7 @@ public:
		// Enable only if all arguments are integral, to prevent subind_2_linind(int*, int) from
		// resolving to subind_2_linind(int...) rather than subind_2_linind(const int* const, int).
		template <typename... Ts>
		std::enable_if_t<util::if_all<std::is_integral<Ts>::value...>::value, int> subind_2_linind(
		std::enable_if_t<util::ifAll(std::is_integral_v<Ts>...), int> subind_2_linind(
		const Ts... subindices) const {
		// We need to cast all subindices to the same type for dmn_variadic.
		return dmn(static_cast<int>(subindices)...);

include/dca/linalg/matrix.hpp

+18 −24

Original line number	Diff line number	Diff line
		@@ -194,22 +194,20 @@ public:
		// Swaps the contents of the matrix, including the name, with those of rhs.
		void swapWithName(Matrix<ScalarType, device_name>& rhs);

		#ifdef DCA_HAVE_CUDA
		// Asynchronous assignment.
		// Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
		// + synchronization of stream
		template <DeviceType rhs_device_name>
		void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, cudaStream_t stream);
		void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

		// Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
		// Asynchronous assignment.
		template <DeviceType rhs_device_name>
		void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);
		void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::CudaStream& stream);

		void setToZero(cudaStream_t stream);
		#else
		// Synchronous assignment fallback for SetAsync.
		// Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
		template <DeviceType rhs_device_name>
		void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

		#endif // DCA_HAVE_CUDA
		void setToZero(const util::CudaStream& stream);

		// Prints the values of the matrix elements.
		void print() const;
		@@ -413,12 +411,19 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam
		swap(rhs);
		}

		#ifdef DCA_HAVE_CUDA
		// Assigns rhs to this matrix: resizes to the size of rhs, then copies its elements through
		// util::memoryCopy using the stream selected by (thread_id, stream_id).
		// NOTE(review): the declaration comment states that the stream is synchronized afterwards;
		// that happens, if at all, inside util::memoryCopy - confirm there.
		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
		                                          int thread_id, int stream_id) {
		  resize(rhs.size_);

		  // Leading dimensions are queried after the resize, since resizing may change ours.
		  const auto dest_ld = leadingDimension();
		  const auto src_ld = rhs.leadingDimension();
		  util::memoryCopy(data_, dest_ld, rhs.data_, src_ld, size_, thread_id, stream_id);
		}

		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs,
		const cudaStream_t stream) {
		const util::CudaStream& stream) {
		resizeNoCopy(rhs.size_);
		util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream);
		}
		@@ -431,21 +436,10 @@ void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_devi
		}

		template <typename ScalarType, DeviceType device_name>
		void Matrix<ScalarType, device_name>::setToZero(cudaStream_t stream) {
		cudaMemsetAsync(data_, 0, leadingDimension() * nrCols() * sizeof(ScalarType), stream);
		}

		#else

		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs,
		int /thread_id/, int /stream_id/) {
		set(rhs);
		void Matrix<ScalarType, device_name>::setToZero(const util::CudaStream& stream) {
		util::Memory<device_name>::setToZeroAsync(data_, leadingDimension() * nrCols(), stream);
		}

		#endif // DCA_HAVE_CUDA

		template <typename ScalarType, DeviceType device_name>
		void Matrix<ScalarType, device_name>::print() const {
		if (device_name == GPU)

include/dca/linalg/multi_vector.hpp

0 → 100644

+104 −0

Original line number	Diff line number	Diff line
		// Copyright (C) 2018 ETH Zurich
		// Copyright (C) 2018 UT-Battelle, LLC
		// All rights reserved.
		//
		// See LICENSE for terms of usage.
		// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
		//
		// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
		//
		// This class represents a structure of arrays (SoA) where each array has the same length but
		// an arbitrary element type.

		#ifndef DCA_LINALG_MULTI_VECTOR_HPP
		#define DCA_LINALG_MULTI_VECTOR_HPP

		#include "dca/linalg/vector.hpp"
		#include "dca/linalg/util/cuda_stream.hpp"
		#include "dca/util/type_list.hpp"
		#include "dca/util/pack_operations.hpp"

		namespace dca {
		namespace linalg {
		// dca::linalg::

		// Container holding one array per type in Ts..., all with the same number of elements. The
		// arrays share a single Vector<unsigned char, device> buffer.
		template <DeviceType device, typename... Ts>
		class MultiVector {
		public:
		  using Types = dca::util::Typelist<Ts...>;
		  // Element type of the id-th sub-array.
		  template <unsigned id>
		  using Type = typename dca::util::TypeAt<id, Types>::type;

		  // Initializes each sub-array with size n.
		  MultiVector(std::size_t n = 0);

		  // Resizes the container so that each sub-array has size n, invalidating references and values.
		  void resizeNoCopy(std::size_t n);

		  // Copies the values of rhs asynchronously, forwarding the transfer of the byte buffer to
		  // Vector::setAsync on the given stream.
		  // The buffer is updated first and size_ only afterwards (the same order used by
		  // resizeNoCopy), so that size_ never describes a buffer that was not resized if the
		  // transfer throws.
		  template <DeviceType other_device>
		  void setAsync(const MultiVector<other_device, Ts...>& rhs,
		                const linalg::util::CudaStream& stream) {
		    data_.setAsync(rhs.data_, stream);
		    size_ = rhs.size_;
		  }

		  // Returns a pointer to the beginning of the id-th array.
		  // Preconditions: 0 <= id < length(Ts...).
		  template <unsigned id>
		  auto get() -> Type<id>*;
		  template <unsigned id>
		  auto get() const -> const Type<id>*;

		  // Returns the number of elements of each sub-array.
		  std::size_t size() const {
		    return size_;
		  }

		  // Allows setAsync to access the data on another device.
		  template <DeviceType other_device, typename... T2s>
		  friend class MultiVector;

		private:
		  // Returns the position, inside data_, at which the id-th array starts.
		  template <unsigned id>
		  std::size_t offset() const;

		  // Raw storage shared by all the sub-arrays.
		  Vector<unsigned char, device> data_;
		  // Number of elements of each sub-array.
		  std::size_t size_;
		};

		// Constructs the container with n elements per sub-array. Allocation and the initialization of
		// size_ are both delegated to resizeNoCopy.
		template <DeviceType device, typename... Ts>
		MultiVector<device, Ts...>::MultiVector(const std::size_t n) {
		  this->resizeNoCopy(n);
		}

		// Resizes the underlying buffer to n * size_sum<Ts...> entries without preserving its content,
		// then records n as the length of each sub-array.
		// NOTE(review): size_sum<Ts...> is presumably the sum of sizeof over Ts... - confirm in
		// dca/util/pack_operations.hpp.
		template <DeviceType device, typename... Ts>
		void MultiVector<device, Ts...>::resizeNoCopy(const std::size_t n) {
		  constexpr auto per_index_size = dca::util::size_sum<Ts...>;

		  // The buffer is resized before size_ is touched.
		  data_.resizeNoCopy(n * per_index_size);
		  size_ = n;
		}

		// Returns a mutable pointer to the first element of the id-th sub-array, obtained by
		// reinterpreting the position offset<id>() of the byte buffer.
		template <DeviceType device, typename... Ts>
		template <unsigned id>
		auto MultiVector<device, Ts...>::get() -> Type<id>* {
		  return reinterpret_cast<Type<id>*>(data_.ptr() + offset<id>());
		}

		// Returns a read-only pointer to the first element of the id-th sub-array, obtained by
		// reinterpreting the position offset<id>() of the byte buffer.
		template <DeviceType device, typename... Ts>
		template <unsigned id>
		auto MultiVector<device, Ts...>::get() const -> const Type<id>* {
		  return reinterpret_cast<const Type<id>*>(data_.ptr() + offset<id>());
		}

		// Returns the position inside data_ at which the id-th sub-array starts, computed as
		// size_ * size_sum<Sublist<id, Ts...>>. Fails to compile if id is not a valid sub-array index.
		// NOTE(review): Sublist<id, Ts...> presumably selects the types preceding the id-th one -
		// confirm in dca/util/type_list.hpp.
		// NOTE(review): the result is not rounded up to alignof(Type<id>); with element types of
		// different alignment the sub-array may start at a misaligned address - confirm that callers
		// order Ts... accordingly.
		template <DeviceType device, typename... Ts>
		template <unsigned id>
		std::size_t MultiVector<device, Ts...>::offset() const {
		  static_assert(id < sizeof...(Ts), "Invalid sub-array id.");

		  using Preceding = dca::util::Sublist<id, Ts...>;
		  constexpr unsigned preceding_size = dca::util::size_sum<Preceding>;
		  return size_ * preceding_size;
		}

		} // namespace linalg
		} // namespace dca

		#endif // DCA_LINALG_MULTI_VECTOR_HPP

include/dca/linalg/util/copy.hpp

+16 −3

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@
		#include <complex>
		#include <cstring>
		#include "dca/linalg/device_type.hpp"
		#include "cuda_stream.hpp"

		#ifdef DCA_HAVE_CUDA
		#include <cuda_runtime.h>
		@@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src
		memoryCopyCpu(dest, ld_dest, src, ld_src, size);
		}

		// Synchronous 1D memory copy fallback.
		template <typename ScalarType>
		void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size,
		const util::CudaStream& /s/) {
		memoryCopyCpu(dest, src, size);
		}
		template <typename ScalarType>
		void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src,
		std::pair<int, int> size, const util::CudaStream& /s/) {
		memoryCopyCpu(dest, ld_dest, src, ld_src, size);
		}

		#endif // DCA_HAVE_CUDA

		} // util
		} // linalg
		} // dca
		} // namespace util
		} // namespace linalg
		} // namespace dca

		#endif // DCA_LINALG_UTIL_COPY_HPP

include/dca/linalg/util/magma_batched_gemm.hpp

+3 −3

Original line number	Diff line number	Diff line
		@@ -100,9 +100,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb,
		const ScalarType beta, const int lda, const int ldb,
		const int ldc) {
		// TODO: store in a buffer if the performance gain is necessary.
		a_ptr_dev_.setAsync(a_ptr_, queue_);
		b_ptr_dev_.setAsync(b_ptr_, queue_);
		c_ptr_dev_.setAsync(c_ptr_, queue_);
		a_ptr_dev_.setAsync(a_ptr_, queue_.getStream());
		b_ptr_dev_.setAsync(b_ptr_, queue_.getStream());
		c_ptr_dev_.setAsync(c_ptr_, queue_.getStream());
		copied_.record(queue_);

		const int n_batched = a_ptr_.size();