Loading cmake/dca_config.cmake +30 −6 Original line number Diff line number Diff line Loading @@ -201,15 +201,29 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/rng.hpp.in" ################################################################################ # Select the cluster solver. set(DCA_CLUSTER_SOLVER "CT-AUX" CACHE STRING "The cluster solver for the DCA(+) loop. Options are: CT-AUX | SS-CT-HYB.") set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX SS-CT-HYB) "The cluster solver for the DCA(+) loop. Options are: CT-AUX | CT-INT | SS-CT-HYB.") set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX CT-INT SS-CT-HYB) if (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX") if (DCA_CLUSTER_SOLVER STREQUAL "CT-INT") set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_INT) set(DCA_CLUSTER_SOLVER_INCLUDE "dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp") set(DCA_USE_CTINT_SUBMATRIX ON CACHE BOOL "Use submatrix updates if the CT-INT solver is selected.") if(DCA_USE_CTINT_SUBMATRIX) set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, true>") else() set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, false>") endif() elseif (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX") set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_AUX) set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtauxClusterSolver<walker_device, ParametersType, DcaDataType, DIST>") set(DCA_CLUSTER_SOLVER_INCLUDE "dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp") elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB") set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::SS_CT_HYB) set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::SsCtHybClusterSolver<walker_device, ParametersType, DcaDataType, DIST>") Loading @@ -222,7 +236,8 @@ elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB") # "dca/phys/dca_step/cluster_solver/high_temperature_series_expansion/high_temperature_series_expansion_solver.hpp") else() 
message(FATAL_ERROR "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | SS-CT-HYB.") message(FATAL_ERROR "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | CT-INT | SS-CT-HYB.") endif() ################################################################################ Loading Loading @@ -312,6 +327,15 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in" "${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY) ################################################################################ # Symmetrization option(DCA_SYMMETRIZE "Apply cluster, time and frequency symmetries to single particle functions." ON) if(DCA_SYMMETRIZE) add_compile_definitions(DCA_WITH_SYMMETRIZATION) endif() ################################################################################ # Generate applications' config files. configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in" Loading include/dca/config/cmake_options.hpp +0 −1 Original line number Diff line number Diff line Loading @@ -26,7 +26,6 @@ struct CMakeOptions { // Parallelization static const std::string dca_with_mpi; static const std::string dca_with_threaded_solver; static const std::string dca_threading_library; // Others static const std::string dca_cluster_solver; Loading include/dca/linalg/lapack/magma.hpp +15 −15 Original line number Diff line number Diff line Loading @@ -124,7 +124,7 @@ inline magma_trans_t toMagmaTrans(const char x) { inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k, const float alpha, const float* const* a, int* lda, const float* const* b, int* ldb, const float beta, float** c, int* ldc, const int batch_count, magma_queue_t& queue) { int* ldc, const int batch_count, magma_queue_t queue) { magmablas_sgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -132,7 +132,7 @@ inline void 
magmablas_gemm_vbatched(const char transa, const char transb, int* m inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k, const double alpha, const double* const* a, int* lda, const double* const* b, int* ldb, const double beta, double** c, int* ldc, const int batch_count, const magma_queue_t& queue) { int* ldc, const int batch_count, const magma_queue_t queue) { magmablas_dgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -142,7 +142,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m const std::complex<float>* const* a, int* lda, const std::complex<float>* const* b, int* ldb, const std::complex<float> beta, std::complex<float>** c, int* ldc, const int batch_count, const magma_queue_t& queue) { int* ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), Loading @@ -154,7 +154,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m const std::complex<double>* const* a, int* lda, const std::complex<double>* const* b, int* ldb, const std::complex<double> beta, std::complex<double>** c, int* ldc, const int batch_count, const magma_queue_t& queue) { int* ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), Loading @@ -168,7 +168,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr const float* const* b, int* ldb, const float beta, float** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { magma_queue_t queue) 
{ magmablas_sgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max, k_max, queue); Loading @@ -181,7 +181,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr const double* const* b, int* ldb, const double beta, double** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { magma_queue_t queue) { magmablas_dgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max, k_max, queue); Loading @@ -192,7 +192,7 @@ inline void magmablas_gemm_vbatched_max_nocheck( const char transa, const char transb, int* m, int* n, int* k, const std::complex<float> alpha, const std::complex<float>* const* a, int* lda, const std::complex<float>* const* b, int* ldb, const std::complex<float> beta, std::complex<float>** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { const int m_max, const int n_max, const int k_max, magma_queue_t queue) { using util::castCudaComplex; magmablas_cgemm_vbatched_max_nocheck( toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), Loading @@ -205,7 +205,7 @@ inline void magmablas_gemm_vbatched_max_nocheck( const char transa, const char transb, int* m, int* n, int* k, const std::complex<double> alpha, const std::complex<double>* const* a, int* lda, const std::complex<double>* const* b, int* ldb, const std::complex<double> beta, std::complex<double>** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { const int m_max, const int n_max, const int k_max, magma_queue_t queue) { using util::castCudaComplex; magmablas_zgemm_vbatched_max_nocheck( toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), Loading @@ -218,7 +218,7 @@ inline void 
magmablas_gemm_batched(const char transa, const char transb, const i const int k, const float alpha, const float* const* a, const int lda, const float* const* b, const int ldb, const float beta, float** c, const int ldc, const int batch_count, magma_queue_t& queue) { const int batch_count, magma_queue_t queue) { magmablas_sgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -227,7 +227,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const int k, const double alpha, const double* const* a, const int lda, const double* const* b, const int ldb, const double beta, double** c, const int ldc, const int batch_count, const magma_queue_t& queue) { const int batch_count, const magma_queue_t queue) { magmablas_dgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -237,7 +237,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const std::complex<float>* const* a, const int lda, const std::complex<float>* const* b, const int ldb, const std::complex<float> beta, std::complex<float>** c, const int ldc, const int batch_count, const magma_queue_t& queue) { const int ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb, Loading @@ -249,7 +249,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const std::complex<double>* const* a, const int lda, const std::complex<double>* const* b, const int ldb, const std::complex<double> beta, std::complex<double>** c, const int ldc, const int batch_count, const magma_queue_t& queue) { const int ldc, const int batch_count, const magma_queue_t queue) { 
using util::castCudaComplex; magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb, Loading @@ -276,8 +276,8 @@ inline int get_getri_nb<std::complex<double>>(int n) { return magma_get_zgetri_nb(n); } } // magma } // linalg } // dca } // namespace magma } // namespace linalg } // namespace dca #endif // DCA_LINALG_LAPACK_MAGMA_HPP include/dca/linalg/util/cuda_stream.hpp +11 −2 Original line number Diff line number Diff line Loading @@ -31,9 +31,14 @@ public: } CudaStream(const CudaStream& other) = delete; CudaStream& operator=(const CudaStream& other) = delete; CudaStream(CudaStream&& other) { std::swap(stream_, other.stream_); CudaStream(CudaStream&& other) noexcept { swap(other); } CudaStream& operator=(CudaStream&& other) noexcept { swap(other); return *this; } void sync() const { Loading @@ -49,6 +54,10 @@ public: return stream_; } void swap(CudaStream& other) noexcept { std::swap(stream_, other.stream_); } private: cudaStream_t stream_ = nullptr; }; Loading include/dca/linalg/util/magma_batched_gemm.hpp +15 −14 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #include "dca/linalg/lapack/magma.hpp" #include "dca/linalg/util/allocators/vectors_typedefs.hpp" #include "dca/linalg/util/cuda_event.hpp" #include "dca/linalg/util/magma_queue.hpp" #include "dca/linalg/vector.hpp" namespace dca { Loading @@ -30,7 +31,7 @@ template <typename ScalarType> class MagmaBatchedGemm { public: // Creates a plan for a batched gemm. MagmaBatchedGemm(magma_queue_t queue); MagmaBatchedGemm(const linalg::util::MagmaQueue& queue); // Creates a plan for a batched gemm and allocates the memory for the arguments of `size` // multiplications. 
MagmaBatchedGemm(int size, magma_queue_t queue); Loading @@ -52,8 +53,7 @@ public: void synchronizeCopy(); private: magma_queue_t queue_; const cudaStream_t stream_; const linalg::util::MagmaQueue& queue_; CudaEvent copied_; linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_; Loading @@ -64,8 +64,8 @@ private: }; template <typename ScalarType> MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(magma_queue_t queue) : queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)) {} MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const linalg::util::MagmaQueue& queue) : queue_(queue) {} template <typename ScalarType> MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const int size, magma_queue_t queue) Loading Loading @@ -99,10 +99,11 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb, const int n, const int k, const ScalarType alpha, const ScalarType beta, const int lda, const int ldb, const int ldc) { a_ptr_dev_.setAsync(a_ptr_, stream_); b_ptr_dev_.setAsync(b_ptr_, stream_); c_ptr_dev_.setAsync(c_ptr_, stream_); copied_.record(stream_); // TODO: store in a buffer if the performance gain is necessary. a_ptr_dev_.setAsync(a_ptr_, queue_); b_ptr_dev_.setAsync(b_ptr_, queue_); c_ptr_dev_.setAsync(c_ptr_, queue_); copied_.record(queue_); const int n_batched = a_ptr_.size(); magma::magmablas_gemm_batched(transa, transb, m, n, k, alpha, a_ptr_dev_.ptr(), lda, Loading @@ -111,9 +112,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb, assert(cudaPeekAtLastError() == cudaSuccess); } } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_HAVE_CUDA #endif // DCA_LINALG_UTIL_MAGMA_BATCHED_GEMM_HPP Loading
cmake/dca_config.cmake +30 −6 Original line number Diff line number Diff line Loading @@ -201,15 +201,29 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/rng.hpp.in" ################################################################################ # Select the cluster solver. set(DCA_CLUSTER_SOLVER "CT-AUX" CACHE STRING "The cluster solver for the DCA(+) loop. Options are: CT-AUX | SS-CT-HYB.") set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX SS-CT-HYB) "The cluster solver for the DCA(+) loop. Options are: CT-AUX | CT-INT | SS-CT-HYB.") set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX CT-INT SS-CT-HYB) if (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX") if (DCA_CLUSTER_SOLVER STREQUAL "CT-INT") set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_INT) set(DCA_CLUSTER_SOLVER_INCLUDE "dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp") set(DCA_USE_CTINT_SUBMATRIX ON CACHE BOOL "Use submatrix updates if the CT-INT solver is selected.") if(DCA_USE_CTINT_SUBMATRIX) set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, true>") else() set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, false>") endif() elseif (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX") set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_AUX) set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtauxClusterSolver<walker_device, ParametersType, DcaDataType, DIST>") set(DCA_CLUSTER_SOLVER_INCLUDE "dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp") elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB") set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::SS_CT_HYB) set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::SsCtHybClusterSolver<walker_device, ParametersType, DcaDataType, DIST>") Loading @@ -222,7 +236,8 @@ elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB") # "dca/phys/dca_step/cluster_solver/high_temperature_series_expansion/high_temperature_series_expansion_solver.hpp") else() 
message(FATAL_ERROR "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | SS-CT-HYB.") message(FATAL_ERROR "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | CT-INT | SS-CT-HYB.") endif() ################################################################################ Loading Loading @@ -312,6 +327,15 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in" "${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY) ################################################################################ # Symmetrization option(DCA_SYMMETRIZE "Apply cluster, time and frequency symmetries to single particle functions." ON) if(DCA_SYMMETRIZE) add_compile_definitions(DCA_WITH_SYMMETRIZATION) endif() ################################################################################ # Generate applications' config files. configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in" Loading
include/dca/config/cmake_options.hpp +0 −1 Original line number Diff line number Diff line Loading @@ -26,7 +26,6 @@ struct CMakeOptions { // Parallelization static const std::string dca_with_mpi; static const std::string dca_with_threaded_solver; static const std::string dca_threading_library; // Others static const std::string dca_cluster_solver; Loading
include/dca/linalg/lapack/magma.hpp +15 −15 Original line number Diff line number Diff line Loading @@ -124,7 +124,7 @@ inline magma_trans_t toMagmaTrans(const char x) { inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k, const float alpha, const float* const* a, int* lda, const float* const* b, int* ldb, const float beta, float** c, int* ldc, const int batch_count, magma_queue_t& queue) { int* ldc, const int batch_count, magma_queue_t queue) { magmablas_sgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -132,7 +132,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k, const double alpha, const double* const* a, int* lda, const double* const* b, int* ldb, const double beta, double** c, int* ldc, const int batch_count, const magma_queue_t& queue) { int* ldc, const int batch_count, const magma_queue_t queue) { magmablas_dgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -142,7 +142,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m const std::complex<float>* const* a, int* lda, const std::complex<float>* const* b, int* ldb, const std::complex<float> beta, std::complex<float>** c, int* ldc, const int batch_count, const magma_queue_t& queue) { int* ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), Loading @@ -154,7 +154,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m const std::complex<double>* const* a, int* lda, const 
std::complex<double>* const* b, int* ldb, const std::complex<double> beta, std::complex<double>** c, int* ldc, const int batch_count, const magma_queue_t& queue) { int* ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), Loading @@ -168,7 +168,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr const float* const* b, int* ldb, const float beta, float** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { magma_queue_t queue) { magmablas_sgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max, k_max, queue); Loading @@ -181,7 +181,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr const double* const* b, int* ldb, const double beta, double** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { magma_queue_t queue) { magmablas_dgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max, k_max, queue); Loading @@ -192,7 +192,7 @@ inline void magmablas_gemm_vbatched_max_nocheck( const char transa, const char transb, int* m, int* n, int* k, const std::complex<float> alpha, const std::complex<float>* const* a, int* lda, const std::complex<float>* const* b, int* ldb, const std::complex<float> beta, std::complex<float>** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { const int m_max, const int n_max, const int k_max, magma_queue_t queue) { using util::castCudaComplex; magmablas_cgemm_vbatched_max_nocheck( toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), 
Loading @@ -205,7 +205,7 @@ inline void magmablas_gemm_vbatched_max_nocheck( const char transa, const char transb, int* m, int* n, int* k, const std::complex<double> alpha, const std::complex<double>* const* a, int* lda, const std::complex<double>* const* b, int* ldb, const std::complex<double> beta, std::complex<double>** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t& queue) { const int m_max, const int n_max, const int k_max, magma_queue_t queue) { using util::castCudaComplex; magmablas_zgemm_vbatched_max_nocheck( toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), Loading @@ -218,7 +218,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const int k, const float alpha, const float* const* a, const int lda, const float* const* b, const int ldb, const float beta, float** c, const int ldc, const int batch_count, magma_queue_t& queue) { const int batch_count, magma_queue_t queue) { magmablas_sgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -227,7 +227,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const int k, const double alpha, const double* const* a, const int lda, const double* const* b, const int ldb, const double beta, double** c, const int ldc, const int batch_count, const magma_queue_t& queue) { const int batch_count, const magma_queue_t queue) { magmablas_dgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, queue); checkErrorsCudaDebug(); Loading @@ -237,7 +237,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const std::complex<float>* const* a, const int lda, const std::complex<float>* const* b, const int ldb, const std::complex<float> beta, std::complex<float>** c, const int ldc, const int 
batch_count, const magma_queue_t& queue) { const int ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb, Loading @@ -249,7 +249,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const std::complex<double>* const* a, const int lda, const std::complex<double>* const* b, const int ldb, const std::complex<double> beta, std::complex<double>** c, const int ldc, const int batch_count, const magma_queue_t& queue) { const int ldc, const int batch_count, const magma_queue_t queue) { using util::castCudaComplex; magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb, Loading @@ -276,8 +276,8 @@ inline int get_getri_nb<std::complex<double>>(int n) { return magma_get_zgetri_nb(n); } } // magma } // linalg } // dca } // namespace magma } // namespace linalg } // namespace dca #endif // DCA_LINALG_LAPACK_MAGMA_HPP
include/dca/linalg/util/cuda_stream.hpp +11 −2 Original line number Diff line number Diff line Loading @@ -31,9 +31,14 @@ public: } CudaStream(const CudaStream& other) = delete; CudaStream& operator=(const CudaStream& other) = delete; CudaStream(CudaStream&& other) { std::swap(stream_, other.stream_); CudaStream(CudaStream&& other) noexcept { swap(other); } CudaStream& operator=(CudaStream&& other) noexcept { swap(other); return *this; } void sync() const { Loading @@ -49,6 +54,10 @@ public: return stream_; } void swap(CudaStream& other) noexcept { std::swap(stream_, other.stream_); } private: cudaStream_t stream_ = nullptr; }; Loading
include/dca/linalg/util/magma_batched_gemm.hpp +15 −14 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #include "dca/linalg/lapack/magma.hpp" #include "dca/linalg/util/allocators/vectors_typedefs.hpp" #include "dca/linalg/util/cuda_event.hpp" #include "dca/linalg/util/magma_queue.hpp" #include "dca/linalg/vector.hpp" namespace dca { Loading @@ -30,7 +31,7 @@ template <typename ScalarType> class MagmaBatchedGemm { public: // Creates a plan for a batched gemm. MagmaBatchedGemm(magma_queue_t queue); MagmaBatchedGemm(const linalg::util::MagmaQueue& queue); // Creates a plan for a batched gemm and allocates the memory for the arguments of `size` // multiplications. MagmaBatchedGemm(int size, magma_queue_t queue); Loading @@ -52,8 +53,7 @@ public: void synchronizeCopy(); private: magma_queue_t queue_; const cudaStream_t stream_; const linalg::util::MagmaQueue& queue_; CudaEvent copied_; linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_; Loading @@ -64,8 +64,8 @@ private: }; template <typename ScalarType> MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(magma_queue_t queue) : queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)) {} MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const linalg::util::MagmaQueue& queue) : queue_(queue) {} template <typename ScalarType> MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const int size, magma_queue_t queue) Loading Loading @@ -99,10 +99,11 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb, const int n, const int k, const ScalarType alpha, const ScalarType beta, const int lda, const int ldb, const int ldc) { a_ptr_dev_.setAsync(a_ptr_, stream_); b_ptr_dev_.setAsync(b_ptr_, stream_); c_ptr_dev_.setAsync(c_ptr_, stream_); copied_.record(stream_); // TODO: store in a buffer if the performance gain is necessary. 
a_ptr_dev_.setAsync(a_ptr_, queue_); b_ptr_dev_.setAsync(b_ptr_, queue_); c_ptr_dev_.setAsync(c_ptr_, queue_); copied_.record(queue_); const int n_batched = a_ptr_.size(); magma::magmablas_gemm_batched(transa, transb, m, n, k, alpha, a_ptr_dev_.ptr(), lda, Loading @@ -111,9 +112,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb, assert(cudaPeekAtLastError() == cudaSuccess); } } // util } // linalg } // dca } // namespace util } // namespace linalg } // namespace dca #endif // DCA_HAVE_CUDA #endif // DCA_LINALG_UTIL_MAGMA_BATCHED_GEMM_HPP