build-aux/summit.cmake  +3 −3

@@ -14,10 +14,10 @@ mark_as_advanced(DCA_ESSL_INCLUDES)
 
 # Use jsrun for executing the tests.
 set(TEST_RUNNER "jsrun" CACHE STRING "Command for executing (MPI) programs.")
-set(MPIEXEC_NUMPROC_FLAG "-a" CACHE STRING
+set(MPIEXEC_NUMPROC_FLAG "-n" CACHE STRING
     "Flag used by TEST_RUNNER to specify the number of processes.")
-# Use 1 resource set with 1 GPU and 8 cores for executing the tests.
-set(MPIEXEC_PREFLAGS "-n 1 -g 1 -c 8" CACHE STRING
+# Use 1 resource set with 1 GPU and 5 cores for executing the tests.
+set(MPIEXEC_PREFLAGS "-a 1 -g 1 -c 5" CACHE STRING
     "Flags to pass to TEST_RUNNER directly before the executable to run.")
 
 # The flag "--smpiargs=none" is needed to execute tests with no MPI functionalities.
 set(SMPIARGS_FLAG_NOMPI "--smpiargs=none" CACHE STRING
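The swap matters because CMake composes each test command as TEST_RUNNER, then MPIEXEC_NUMPROC_FLAG with the process count, then MPIEXEC_PREFLAGS, then the executable. With jsrun, `-n` is the number of resource sets while `-a`, `-g`, and `-c` size each set, so a 4-rank test now launches as something like `jsrun -n 4 -a 1 -g 1 -c 5 some_mpi_test` (the rank count and test name are illustrative, and this assumes DCA++'s test helper follows the standard CMake MPIEXEC composition).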
include/dca/linalg/matrix.hpp  +3 −2

@@ -38,6 +38,7 @@ public:
   using ThisType = Matrix<ScalarType, device_name>;
   using ValueType = ScalarType;
   using Allocator = util::DefaultAllocator<ScalarType, device_name>;
+  constexpr static DeviceType device = device_name;
 
   Matrix(const std::string& name = default_name_);

@@ -517,7 +518,7 @@ std::size_t Matrix<ScalarType, device_name>::deviceFingerprint() const {
   return 0;
 }
 
-}  // linalg
-}  // dca
+}  // namespace linalg
+}  // namespace dca
 
 #endif  // DCA_LINALG_MATRIX_HPP
include/dca/linalg/matrix_view.hpp  +3 −0

@@ -28,6 +28,9 @@ namespace linalg {
 template <typename ScalarType, DeviceType device_name = linalg::CPU>
 class MatrixView {
 public:
+  using ValueType = ScalarType;
+  constexpr static DeviceType device = device_name;
+
   using Pair = std::pair<int, int>;
 
   MatrixView(ScalarType* data, Pair size);
   MatrixView(ScalarType* data, Pair size, int ld);
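With ValueType and device now exposed by both Matrix and MatrixView, generic code can deduce the scalar type and target device from a single template parameter. A minimal sketch under those assumptions (zeroDiagonal is an illustrative helper, not part of the PR; C++17 if constexpr is used for brevity):

#include <algorithm>
#include "dca/linalg/matrix.hpp"
#include "dca/linalg/matrix_view.hpp"

// Accepts Matrix and MatrixView alike and dispatches on the device trait.
template <class MatrixLike>
void zeroDiagonal(MatrixLike& m) {
  using Scalar = typename MatrixLike::ValueType;
  if constexpr (MatrixLike::device == dca::linalg::CPU) {
    for (int i = 0; i < std::min(m.nrRows(), m.nrCols()); ++i)
      m(i, i) = Scalar(0);  // Assumes host-side element access via operator().
  }
  else {
    // A GPU specialization (e.g. a kernel launch) would go here.
  }
}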
include/dca/linalg/matrixop.hpp  +81 −4

@@ -15,6 +15,7 @@
 //  - real
 //  - insertCol, insertRow (for CPU matrices only)
 //  - inverse
+//  - inverseAndDeterminant
 //  - removeCol, removeCols, removeRow, removeRows, removeRowAndCol, removeRowAndCols
 //  - scaleCol, scaleRow, scaleRows
 //  - swapCol, swapRow, swapRowAndCol

@@ -22,6 +23,7 @@
 //  - gemm
 //  - multiply
 //  - trsm
+//  - determinant
 //  - eigensolver (non-symmetric / symmetric / Hermitian)
 //  - pseudoInverse

@@ -128,10 +130,9 @@ inline void copyRow(const Matrix<Scalar, device_name>& mat_x, int ix,
 // Preconditions: i_x.size() <= i_y.size(), mat_x.nrCols() == mat_y.nrCols()
 //                0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size(),
 //                0 <= i_y[i] < mat_y.nrRows() for 0 <= i < i_x.size().
-template <typename Scalar>
-inline void copyRows(const Matrix<Scalar, CPU>& mat_x, const Vector<int, CPU>& i_x,
-                     Matrix<Scalar, CPU>& mat_y, const Vector<int, CPU>& i_y,
-                     int /*thread_id*/ = 0, int /*stream_id*/ = 0) {
+template <typename Scalar, class Vec>
+inline void copyRows(const Matrix<Scalar, CPU>& mat_x, const Vec& i_x,
+                     Matrix<Scalar, CPU>& mat_y, const Vec& i_y, int /*thread_id*/ = 0,
+                     int /*stream_id*/ = 0) {
   assert(i_x.size() <= i_y.size());
   assert(mat_x.nrCols() == mat_y.nrCols());

@@ -150,6 +151,19 @@ inline void copyRows(const Matrix<Scalar, GPU>& mat_x, const Vector<int, GPU>& i
   blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(),
                  i_y.ptr(), mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id);
 }
+
+// Copies the i_x rows of mat_x into mat_y, for 0 <= i < i_x.size().
+// In/Out: mat_y
+// Preconditions: mat_x.nrCols() == mat_y.nrCols()
+//                0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size().
+template <typename Scalar>
+inline void copyRows(const Matrix<Scalar, GPU>& mat_x, const Vector<int, GPU>& i_x,
+                     Matrix<Scalar, GPU>& mat_y, int thread_id = 0, int stream_id = 0) {
+  assert(mat_x.nrCols() == mat_y.nrCols());
+  blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(),
+                 mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id);
+}
 #endif  // DCA_HAVE_CUDA
 
 // Returns the difference of two matrices in terms of max_i,j(|a(i, j) - b(i, j)|).

@@ -325,6 +339,33 @@ void smallInverse(Matrix<Scalar, CPU>& m_inv) {
   smallInverse(m_inv, ipiv, work);
 }
 
+// Computes in place the inverse of mat and the determinant of the inverse.
+// In/Out: mat
+// Returns: the determinant of mat^-1.
+// Precondition: mat is a non-singular real matrix.
+template <typename Scalar, template <typename, DeviceType> class MatrixType>
+Scalar inverseAndDeterminant(MatrixType<Scalar, CPU>& mat) {
+  assert(mat.is_square());
+
+  std::vector<int> ipiv(mat.nrRows());
+  lapack::UseDevice<CPU>::getrf(mat.nrRows(), mat.nrCols(), mat.ptr(), mat.leadingDimension(),
+                                ipiv.data());
+
+  Scalar det = 1;
+  for (int i = 0; i < mat.nrCols(); ++i) {
+    det *= mat(i, i);
+    if (ipiv[i] != i + 1)
+      det *= -1;
+  }
+
+  const int lwork = util::getInverseWorkSize(mat);
+  std::vector<Scalar> work(lwork);
+  lapack::UseDevice<CPU>::getri(mat.nrRows(), mat.ptr(), mat.leadingDimension(), ipiv.data(),
+                                work.data(), lwork);
+
+  return 1. / det;
+}
+
 // Remove the j-th column. The data is moved accordingly.
 // In/Out: mat
 // Preconditions: 0 <= j < mat.nrCols().

@@ -1160,6 +1201,42 @@ void pseudoInverse(const Matrix<Scalar, CPU>& a, Matrix<Scalar, CPU>& a_inv, dou
   }
 }
 
+// Computes (in place) the determinant of the matrix.
+// Returns: the determinant.
+// Postcondition: M contains its LU decomposition.
+template <template <typename, DeviceType> class MatrixType, typename Scalar, DeviceType device>
+double determinantIP(MatrixType<Scalar, device>& M) {
+  assert(M.nrCols() == M.nrRows());
+  const int n = M.nrCols();
+  std::vector<int> ipiv(n);
+
+  try {
+    lapack::getrf(n, n, M.ptr(), M.leadingDimension(), ipiv.data());
+  }
+  catch (lapack::util::LapackException& err) {
+    if (err.info() > 0)
+      return 0;
+    else
+      throw(std::logic_error("LU decomposition failed."));
+  }
+
+  double det = 1.;
+  for (int i = 0; i < n; i++) {
+    det *= M(i, i);
+    if (ipiv[i] != i + 1)
+      det *= -1;
+  }
+  return det;
+}
+
+// Copies the matrix and computes its determinant.
+// Returns: the determinant.
+template <typename Scalar>
+double determinant(const Matrix<Scalar, CPU>& M) {
+  Matrix<Scalar, CPU> M_copy(M);
+  return determinantIP(M_copy);
+}
+
 template <typename Scalar>
 bool areNear(const Matrix<Scalar, CPU>& A, const Matrix<Scalar, CPU>& B, const double err = 1e-16) {
   if (A.size() != B.size())
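A usage sketch for the new determinant routines (illustrative only: it assumes a CPU build with a LAPACK backend linked, the dca::linalg::matrixop namespace, and a Matrix(int size) constructor). determinant works on a copy, determinantIP factorizes in place, and inverseAndDeterminant overwrites mat with its inverse and returns det(mat^-1) = 1/det(mat):

#include <cassert>
#include <cmath>
#include "dca/linalg/matrix.hpp"
#include "dca/linalg/matrixop.hpp"

void determinantSketch() {
  dca::linalg::Matrix<double, dca::linalg::CPU> m(2);  // 2x2 matrix.
  m(0, 0) = 4.; m(0, 1) = 3.;
  m(1, 0) = 6.; m(1, 1) = 3.;  // det = 4 * 3 - 3 * 6 = -6.

  const double det = dca::linalg::matrixop::determinant(m);  // m is unchanged.

  // Overwrites m with m^-1 and returns det(m^-1) = 1 / det(m).
  const double det_inv = dca::linalg::matrixop::inverseAndDeterminant(m);

  assert(std::abs(det * det_inv - 1.) < 1e-12);
}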
include/dca/linalg/util/cuda_stream.hpp  +24 −6

@@ -11,16 +11,19 @@
 #ifndef DCA_LINALG_UTIL_CUDA_STREAM_HPP
 #define DCA_LINALG_UTIL_CUDA_STREAM_HPP
 
-#ifdef DCA_HAVE_CUDA
-#include <cuda.h>
-#include <magma.h>
+#ifdef DCA_HAVE_CUDA
+#include <cuda_runtime.h>
+#include "dca/linalg/util/error_cuda.hpp"
+#endif  // DCA_HAVE_CUDA
 
 namespace dca {
 namespace linalg {
 namespace util {
 // dca::linalg::util::
 
+#ifdef DCA_HAVE_CUDA
 class CudaStream {
 public:
   CudaStream() {

@@ -33,6 +36,10 @@ public:
     std::swap(stream_, other.stream_);
   }
 
+  void sync() const {
+    checkRC(cudaStreamSynchronize(stream_));
+  }
+
   ~CudaStream() {
     if (stream_)
       cudaStreamDestroy(stream_);

@@ -46,9 +53,20 @@ private:
   cudaStream_t stream_ = nullptr;
 };
 
-}  // util
-}  // linalg
-}  // dca
+#else  // DCA_HAVE_CUDA
+
+// Mock object.
+class CudaStream {
+public:
+  CudaStream() = default;
+
+  void sync() const {}
+};
+
 #endif  // DCA_HAVE_CUDA
 
+}  // namespace util
+}  // namespace linalg
+}  // namespace dca
+
 #endif  // DCA_LINALG_UTIL_CUDA_STREAM_HPP
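Because the mock exposes the same interface, code that only needs to synchronize can stay oblivious to whether CUDA is enabled. A minimal sketch (finishPendingWork is illustrative, not part of the PR):

#include "dca/linalg/util/cuda_stream.hpp"

// Compiles with or without DCA_HAVE_CUDA: sync() calls
// cudaStreamSynchronize on GPU builds and is a no-op on the mock.
void finishPendingWork(const dca::linalg::util::CudaStream& stream) {
  stream.sync();
}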