build-aux/summit.cmake  +3 −3

@@ -14,10 +14,10 @@ mark_as_advanced(DCA_ESSL_INCLUDES)
 
 # Use jsrun for executing the tests.
 set(TEST_RUNNER "jsrun" CACHE STRING "Command for executing (MPI) programs.")
-set(MPIEXEC_NUMPROC_FLAG "-a" CACHE STRING
+set(MPIEXEC_NUMPROC_FLAG "-n" CACHE STRING
     "Flag used by TEST_RUNNER to specify the number of processes.")
-# Use 1 resource set with 1 GPU and 8 cores for executing the tests.
-set(MPIEXEC_PREFLAGS "-n 1 -g 1 -c 8" CACHE STRING
+# Use 1 resource set with 1 GPU and 5 cores for executing the tests.
+set(MPIEXEC_PREFLAGS "-a 1 -g 1 -c 5" CACHE STRING
     "Flags to pass to TEST_RUNNER directly before the executable to run.")
 
 # The flag "--smpiargs=none" is needed to execute tests with no MPI functionalities.
 set(SMPIARGS_FLAG_NOMPI "--smpiargs=none" CACHE STRING
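The swap matters because CMake composes each test command as TEST_RUNNER, then MPIEXEC_NUMPROC_FLAG with the process count, then MPIEXEC_PREFLAGS, then the executable. With jsrun, `-n` is the number of resource sets while `-a`, `-g`, and `-c` size each set, so a 4-rank test now launches as something like `jsrun -n 4 -a 1 -g 1 -c 5 some_mpi_test` (the rank count and test name are illustrative, and this assumes DCA++'s test helper follows the standard CMake MPIEXEC composition).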
include/dca/linalg/matrix.hpp  +3 −2

@@ -38,6 +38,7 @@ public:
   using ThisType = Matrix<ScalarType, device_name>;
   using ValueType = ScalarType;
   using Allocator = util::DefaultAllocator<ScalarType, device_name>;
+  constexpr static DeviceType device = device_name;
 
   Matrix(const std::string& name = default_name_);

@@ -517,7 +518,7 @@ std::size_t Matrix<ScalarType, device_name>::deviceFingerprint() const {
   return 0;
 }
 
-}  // linalg
-}  // dca
+}  // namespace linalg
+}  // namespace dca
 
 #endif  // DCA_LINALG_MATRIX_HPP
include/dca/linalg/matrix_view.hpp  +3 −0

@@ -28,6 +28,9 @@ namespace linalg {
 template <typename ScalarType, DeviceType device_name = linalg::CPU>
 class MatrixView {
 public:
+  using ValueType = ScalarType;
+  constexpr static DeviceType device = device_name;
+
   using Pair = std::pair<int, int>;
 
   MatrixView(ScalarType* data, Pair size);
   MatrixView(ScalarType* data, Pair size, int ld);
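With ValueType and device now exposed by both Matrix and MatrixView, generic code can deduce the scalar type and target device from a single template parameter. A minimal sketch under those assumptions (zeroDiagonal is an illustrative helper, not part of the PR; C++17 if constexpr is used for brevity):

#include <algorithm>
#include "dca/linalg/matrix.hpp"
#include "dca/linalg/matrix_view.hpp"

// Accepts Matrix and MatrixView alike and dispatches on the device trait.
template <class MatrixLike>
void zeroDiagonal(MatrixLike& m) {
  using Scalar = typename MatrixLike::ValueType;
  if constexpr (MatrixLike::device == dca::linalg::CPU) {
    for (int i = 0; i < std::min(m.nrRows(), m.nrCols()); ++i)
      m(i, i) = Scalar(0);  // Assumes host-side element access via operator().
  }
  else {
    // A GPU specialization (e.g. a kernel launch) would go here.
  }
}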
include/dca/linalg/matrixop.hpp  +81 −4

@@ -15,6 +15,7 @@
 //  - real
 //  - insertCol, insertRow (for CPU matrices only)
 //  - inverse
+//  - inverseAndDeterminant
 //  - removeCol, removeCols, removeRow, removeRows, removeRowAndCol, removeRowAndCols
 //  - scaleCol, scaleRow, scaleRows
 //  - swapCol, swapRow, swapRowAndCol

@@ -22,6 +23,7 @@
 //  - gemm
 //  - multiply
 //  - trsm
+//  - determinant
 //  - eigensolver (non-symmetric / symmetric / Hermitian)
 //  - pseudoInverse

@@ -128,10 +130,9 @@ inline void copyRow(const Matrix<Scalar, device_name>& mat_x, int ix,
 // Preconditions: i_x.size() <= i_y.size(), mat_x.nrCols() == mat_y.nrCols()
 //                0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size(),
 //                0 <= i_y[i] < mat_y.nrRows() for 0 <= i < i_x.size().
-template <typename Scalar>
-inline void copyRows(const Matrix<Scalar, CPU>& mat_x, const Vector<int, CPU>& i_x,
-                     Matrix<Scalar, CPU>& mat_y, const Vector<int, CPU>& i_y,
-                     int /*thread_id*/ = 0, int /*stream_id*/ = 0) {
+template <typename Scalar, class Vec>
+inline void copyRows(const Matrix<Scalar, CPU>& mat_x, const Vec& i_x,
+                     Matrix<Scalar, CPU>& mat_y, const Vec& i_y, int /*thread_id*/ = 0,
+                     int /*stream_id*/ = 0) {
   assert(i_x.size() <= i_y.size());
   assert(mat_x.nrCols() == mat_y.nrCols());

@@ -150,6 +151,19 @@ inline void copyRows(const Matrix<Scalar, GPU>& mat_x, const Vector<int, GPU>& i
   blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(),
                  i_y.ptr(), mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id);
 }
+
+// Copies the i_x rows of mat_x into mat_y, for 0 <= i < i_x.size().
+// In/Out: mat_y
+// Preconditions: mat_x.nrCols() == mat_y.nrCols()
+//                0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size().
+template <typename Scalar>
+inline void copyRows(const Matrix<Scalar, GPU>& mat_x, const Vector<int, GPU>& i_x,
+                     Matrix<Scalar, GPU>& mat_y, int thread_id = 0, int stream_id = 0) {
+  assert(mat_x.nrCols() == mat_y.nrCols());
+  blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(),
+                 mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id);
+}
 #endif  // DCA_HAVE_CUDA
 
 // Returns the difference of two matrices in terms of max_i,j(|a(i, j) - b(i, j)|).

@@ -325,6 +339,33 @@ void smallInverse(Matrix<Scalar, CPU>& m_inv) {
   smallInverse(m_inv, ipiv, work);
 }
 
+// Computes in place the inverse of mat and the determinant of the inverse.
+// In/Out: mat
+// Returns: the determinant of mat^-1.
+// Precondition: mat is a non-singular real matrix.
+template <typename Scalar, template <typename, DeviceType> class MatrixType>
+Scalar inverseAndDeterminant(MatrixType<Scalar, CPU>& mat) {
+  assert(mat.is_square());
+
+  std::vector<int> ipiv(mat.nrRows());
+  lapack::UseDevice<CPU>::getrf(mat.nrRows(), mat.nrCols(), mat.ptr(), mat.leadingDimension(),
+                                ipiv.data());
+
+  Scalar det = 1;
+  for (int i = 0; i < mat.nrCols(); ++i) {
+    det *= mat(i, i);
+    if (ipiv[i] != i + 1)
+      det *= -1;
+  }
+
+  const int lwork = util::getInverseWorkSize(mat);
+  std::vector<Scalar> work(lwork);
+  lapack::UseDevice<CPU>::getri(mat.nrRows(), mat.ptr(), mat.leadingDimension(), ipiv.data(),
+                                work.data(), lwork);
+
+  return 1. / det;
+}
+
 // Remove the j-th column. The data is moved accordingly.
 // In/Out: mat
 // Preconditions: 0 <= j < mat.nrCols().

@@ -1160,6 +1201,42 @@ void pseudoInverse(const Matrix<Scalar, CPU>& a, Matrix<Scalar, CPU>& a_inv, dou
   }
 }
 
+// Computes (in place) the determinant of the matrix.
+// Returns: the determinant.
+// Postcondition: M contains its LU decomposition.
+template <template <typename, DeviceType> class MatrixType, typename Scalar, DeviceType device>
+double determinantIP(MatrixType<Scalar, device>& M) {
+  assert(M.nrCols() == M.nrRows());
+  const int n = M.nrCols();
+  std::vector<int> ipiv(n);
+
+  try {
+    lapack::getrf(n, n, M.ptr(), M.leadingDimension(), ipiv.data());
+  }
+  catch (lapack::util::LapackException& err) {
+    if (err.info() > 0)
+      return 0;
+    else
+      throw(std::logic_error("LU decomposition failed."));
+  }
+
+  double det = 1.;
+  for (int i = 0; i < n; i++) {
+    det *= M(i, i);
+    if (ipiv[i] != i + 1)
+      det *= -1;
+  }
+  return det;
+}
+
+// Copies the matrix and computes its determinant.
+// Returns: the determinant.
+template <typename Scalar>
+double determinant(const Matrix<Scalar, CPU>& M) {
+  Matrix<Scalar, CPU> M_copy(M);
+  return determinantIP(M_copy);
+}
+
 template <typename Scalar>
 bool areNear(const Matrix<Scalar, CPU>& A, const Matrix<Scalar, CPU>& B, const double err = 1e-16) {
   if (A.size() != B.size())
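A usage sketch for the new determinant routines (illustrative only: it assumes a CPU build with a LAPACK backend linked, the dca::linalg::matrixop namespace, and a Matrix(int size) constructor). determinant works on a copy, determinantIP factorizes in place, and inverseAndDeterminant overwrites mat with its inverse and returns det(mat^-1) = 1/det(mat):

#include <cassert>
#include <cmath>
#include "dca/linalg/matrix.hpp"
#include "dca/linalg/matrixop.hpp"

void determinantSketch() {
  dca::linalg::Matrix<double, dca::linalg::CPU> m(2);  // 2x2 matrix.
  m(0, 0) = 4.; m(0, 1) = 3.;
  m(1, 0) = 6.; m(1, 1) = 3.;  // det = 4 * 3 - 3 * 6 = -6.

  const double det = dca::linalg::matrixop::determinant(m);  // m is unchanged.

  // Overwrites m with m^-1 and returns det(m^-1) = 1 / det(m).
  const double det_inv = dca::linalg::matrixop::inverseAndDeterminant(m);

  assert(std::abs(det * det_inv - 1.) < 1e-12);
}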
include/dca/linalg/util/cuda_stream.hpp  +24 −6

@@ -11,16 +11,19 @@
 #ifndef DCA_LINALG_UTIL_CUDA_STREAM_HPP
 #define DCA_LINALG_UTIL_CUDA_STREAM_HPP
 
-#ifdef DCA_HAVE_CUDA
-#include <cuda.h>
-#include <magma.h>
+#ifdef DCA_HAVE_CUDA
+#include <cuda_runtime.h>
+#include "dca/linalg/util/error_cuda.hpp"
+#endif  // DCA_HAVE_CUDA
 
 namespace dca {
 namespace linalg {
 namespace util {
 // dca::linalg::util::
 
+#ifdef DCA_HAVE_CUDA
 class CudaStream {
 public:
   CudaStream() {

@@ -33,6 +36,10 @@ public:
     std::swap(stream_, other.stream_);
   }
 
+  void sync() const {
+    checkRC(cudaStreamSynchronize(stream_));
+  }
+
   ~CudaStream() {
     if (stream_)
       cudaStreamDestroy(stream_);

@@ -46,9 +53,20 @@ private:
   cudaStream_t stream_ = nullptr;
 };
 
-}  // util
-}  // linalg
-}  // dca
+#else  // DCA_HAVE_CUDA
+
+// Mock object.
+class CudaStream {
+public:
+  CudaStream() = default;
+
+  void sync() const {}
+};
+
 #endif  // DCA_HAVE_CUDA
 
+}  // namespace util
+}  // namespace linalg
+}  // namespace dca
+
 #endif  // DCA_LINALG_UTIL_CUDA_STREAM_HPP
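Because the mock exposes the same interface, code that only needs to synchronize can stay oblivious to whether CUDA is enabled. A minimal sketch (finishPendingWork is illustrative, not part of the PR):

#include "dca/linalg/util/cuda_stream.hpp"

// Compiles with or without DCA_HAVE_CUDA: sync() calls
// cudaStreamSynchronize on GPU builds and is a no-op on the mock.
void finishPendingWork(const dca::linalg::util::CudaStream& stream) {
  stream.sync();
}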