Unverified Commit c3e0fb66 authored by Peter Doak's avatar Peter Doak Committed by GitHub
Browse files

Merge pull request #189 from gbalduzz/ct_int-solver

Ct int solver
parents 6fcfcb48 1a2c0792
Loading
Loading
Loading
Loading
+30 −6
Original line number Diff line number Diff line
@@ -201,15 +201,29 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/rng.hpp.in"
################################################################################
# Select the cluster solver.
set(DCA_CLUSTER_SOLVER "CT-AUX" CACHE STRING
  "The cluster solver for the DCA(+) loop. Options are: CT-AUX | SS-CT-HYB.")
set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX SS-CT-HYB)
  "The cluster solver for the DCA(+) loop. Options are: CT-AUX | CT-INT | SS-CT-HYB.")
set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX CT-INT SS-CT-HYB)

if (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX")
if (DCA_CLUSTER_SOLVER STREQUAL "CT-INT")
  set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_INT)
  set(DCA_CLUSTER_SOLVER_INCLUDE "dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp")

  set(DCA_USE_CTINT_SUBMATRIX ON CACHE BOOL "Use submatrix updates if the CT-INT solver is selected.")
  if(DCA_USE_CTINT_SUBMATRIX)
    set(DCA_CLUSTER_SOLVER_TYPE
            "dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, true>")
  else()
    set(DCA_CLUSTER_SOLVER_TYPE
            "dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, false>")
  endif()

elseif (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX")
  set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_AUX)
  set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtauxClusterSolver<walker_device, ParametersType, DcaDataType, DIST>")
  set(DCA_CLUSTER_SOLVER_INCLUDE
      "dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp")


elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB")
  set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::SS_CT_HYB)
  set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::SsCtHybClusterSolver<walker_device, ParametersType, DcaDataType, DIST>")
@@ -222,7 +236,8 @@ elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB")
#     "dca/phys/dca_step/cluster_solver/high_temperature_series_expansion/high_temperature_series_expansion_solver.hpp")

else()
  message(FATAL_ERROR "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | SS-CT-HYB.")
  # Reached only when DCA_CLUSTER_SOLVER matches none of the branches above. The listed names
  # must be spelled exactly as the branches compare them (CT-INT uses a hyphen, not an
  # underscore). The string is kept on a single line so that no newline or indentation
  # whitespace ends up inside the emitted message.
  message(FATAL_ERROR
          "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | CT-INT | SS-CT-HYB.")
endif()

################################################################################
@@ -312,6 +327,15 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in"
        "${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY)


################################################################################
# Symmetrization
# DCA_SYMMETRIZE is a boolean option that defaults to ON. When it is enabled, the
# DCA_WITH_SYMMETRIZATION preprocessor definition is added to the compile definitions; when it
# is disabled, nothing is added here and the macro is left undefined by this section.
option(DCA_SYMMETRIZE "Apply cluster, time and frequency symmetries to single particle functions."
       ON)

if(DCA_SYMMETRIZE)
  # NOTE(review): add_compile_definitions was introduced in CMake 3.12 - confirm that the
  # project's cmake_minimum_required is at least that version.
  add_compile_definitions(DCA_WITH_SYMMETRIZATION)
endif()

################################################################################
# Generate applications' config files.
configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in"
+0 −1
Original line number Diff line number Diff line
@@ -26,7 +26,6 @@ struct CMakeOptions {
  // Parallelization
  static const std::string dca_with_mpi;
  static const std::string dca_with_threaded_solver;
  static const std::string dca_threading_library;

  // Others
  static const std::string dca_cluster_solver;
+15 −15
Original line number Diff line number Diff line
@@ -124,7 +124,7 @@ inline magma_trans_t toMagmaTrans(const char x) {
inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
                                    const float alpha, const float* const* a, int* lda,
                                    const float* const* b, int* ldb, const float beta, float** c,
                                    int* ldc, const int batch_count, magma_queue_t& queue) {
                                    int* ldc, const int batch_count, magma_queue_t queue) {
  magmablas_sgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
                           ldb, beta, c, ldc, batch_count, queue);
  checkErrorsCudaDebug();
@@ -132,7 +132,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
                                    const double alpha, const double* const* a, int* lda,
                                    const double* const* b, int* ldb, const double beta, double** c,
                                    int* ldc, const int batch_count, const magma_queue_t& queue) {
                                    int* ldc, const int batch_count, const magma_queue_t queue) {
  magmablas_dgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
                           ldb, beta, c, ldc, batch_count, queue);
  checkErrorsCudaDebug();
@@ -142,7 +142,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
                                    const std::complex<float>* const* a, int* lda,
                                    const std::complex<float>* const* b, int* ldb,
                                    const std::complex<float> beta, std::complex<float>** c,
                                    int* ldc, const int batch_count, const magma_queue_t& queue) {
                                    int* ldc, const int batch_count, const magma_queue_t queue) {
  using util::castCudaComplex;
  magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
                           *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b),
@@ -154,7 +154,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
                                    const std::complex<double>* const* a, int* lda,
                                    const std::complex<double>* const* b, int* ldb,
                                    const std::complex<double> beta, std::complex<double>** c,
                                    int* ldc, const int batch_count, const magma_queue_t& queue) {
                                    int* ldc, const int batch_count, const magma_queue_t queue) {
  using util::castCudaComplex;
  magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
                           *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b),
@@ -168,7 +168,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr
                                                const float* const* b, int* ldb, const float beta,
                                                float** c, int* ldc, const int batch_count,
                                                const int m_max, const int n_max, const int k_max,
                                                magma_queue_t& queue) {
                                                magma_queue_t queue) {
  magmablas_sgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha,
                                       a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max,
                                       k_max, queue);
@@ -181,7 +181,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr
                                                const double* const* b, int* ldb, const double beta,
                                                double** c, int* ldc, const int batch_count,
                                                const int m_max, const int n_max, const int k_max,
                                                magma_queue_t& queue) {
                                                magma_queue_t queue) {
  magmablas_dgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha,
                                       a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max,
                                       k_max, queue);
@@ -192,7 +192,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(
    const char transa, const char transb, int* m, int* n, int* k, const std::complex<float> alpha,
    const std::complex<float>* const* a, int* lda, const std::complex<float>* const* b, int* ldb,
    const std::complex<float> beta, std::complex<float>** c, int* ldc, const int batch_count,
    const int m_max, const int n_max, const int k_max, magma_queue_t& queue) {
    const int m_max, const int n_max, const int k_max, magma_queue_t queue) {
  using util::castCudaComplex;
  magmablas_cgemm_vbatched_max_nocheck(
      toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha),
@@ -205,7 +205,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(
    const char transa, const char transb, int* m, int* n, int* k, const std::complex<double> alpha,
    const std::complex<double>* const* a, int* lda, const std::complex<double>* const* b, int* ldb,
    const std::complex<double> beta, std::complex<double>** c, int* ldc, const int batch_count,
    const int m_max, const int n_max, const int k_max, magma_queue_t& queue) {
    const int m_max, const int n_max, const int k_max, magma_queue_t queue) {
  using util::castCudaComplex;
  magmablas_zgemm_vbatched_max_nocheck(
      toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha),
@@ -218,7 +218,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
                                   const int k, const float alpha, const float* const* a,
                                   const int lda, const float* const* b, const int ldb,
                                   const float beta, float** c, const int ldc,
                                   const int batch_count, magma_queue_t& queue) {
                                   const int batch_count, magma_queue_t queue) {
  magmablas_sgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
                          ldb, beta, c, ldc, batch_count, queue);
  checkErrorsCudaDebug();
@@ -227,7 +227,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
                                   const int k, const double alpha, const double* const* a,
                                   const int lda, const double* const* b, const int ldb,
                                   const double beta, double** c, const int ldc,
                                   const int batch_count, const magma_queue_t& queue) {
                                   const int batch_count, const magma_queue_t queue) {
  magmablas_dgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
                          ldb, beta, c, ldc, batch_count, queue);
  checkErrorsCudaDebug();
@@ -237,7 +237,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
                                   const std::complex<float>* const* a, const int lda,
                                   const std::complex<float>* const* b, const int ldb,
                                   const std::complex<float> beta, std::complex<float>** c,
                                   const int ldc, const int batch_count, const magma_queue_t& queue) {
                                   const int ldc, const int batch_count, const magma_queue_t queue) {
  using util::castCudaComplex;
  magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
                          *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb,
@@ -249,7 +249,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
                                   const std::complex<double>* const* a, const int lda,
                                   const std::complex<double>* const* b, const int ldb,
                                   const std::complex<double> beta, std::complex<double>** c,
                                   const int ldc, const int batch_count, const magma_queue_t& queue) {
                                   const int ldc, const int batch_count, const magma_queue_t queue) {
  using util::castCudaComplex;
  magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
                          *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb,
@@ -276,8 +276,8 @@ inline int get_getri_nb<std::complex<double>>(int n) {
  return magma_get_zgetri_nb(n);
}

}  // magma
}  // linalg
}  // dca
}  // namespace magma
}  // namespace linalg
}  // namespace dca

#endif  // DCA_LINALG_LAPACK_MAGMA_HPP
+11 −2
Original line number Diff line number Diff line
@@ -31,9 +31,14 @@ public:
  }

  CudaStream(const CudaStream& other) = delete;
  CudaStream& operator=(const CudaStream& other) = delete;

  CudaStream(CudaStream&& other) {
    std::swap(stream_, other.stream_);
  // Move constructor. No member initializer list is used, so stream_ starts from its default
  // member initializer (nullptr) and is then exchanged with other's handle: *this takes over
  // other's stream and other is left holding nullptr.
  CudaStream(CudaStream&& other) noexcept {
    swap(other);
  }
  // Move assignment, implemented as a swap of the two stream handles: the handle previously
  // held by *this ends up in other.
  // NOTE(review): presumably that handle is released when other is destroyed - the destructor
  // is not visible here, confirm.
  CudaStream& operator=(CudaStream&& other) noexcept {
    swap(other);
    return *this;
  }

  void sync() const {
@@ -49,6 +54,10 @@ public:
    return stream_;
  }

  // Exchanges the stream handle stored in *this with the one stored in other.
  // Declared noexcept; only the stream_ members are touched.
  void swap(CudaStream& other) noexcept {
    std::swap(stream_, other.stream_);
  }

private:
  cudaStream_t stream_ = nullptr;
};
+15 −14
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include "dca/linalg/lapack/magma.hpp"
#include "dca/linalg/util/allocators/vectors_typedefs.hpp"
#include "dca/linalg/util/cuda_event.hpp"
#include "dca/linalg/util/magma_queue.hpp"
#include "dca/linalg/vector.hpp"

namespace dca {
@@ -30,7 +31,7 @@ template <typename ScalarType>
class MagmaBatchedGemm {
public:
  // Creates a plan for a batched gemm.
  MagmaBatchedGemm(magma_queue_t queue);
  MagmaBatchedGemm(const linalg::util::MagmaQueue& queue);
  // Creates a plan for a batched gemm and allocates the memory for the arguments of `size`
  // multiplications.
  MagmaBatchedGemm(int size, magma_queue_t queue);
@@ -52,8 +53,7 @@ public:
  void synchronizeCopy();

private:
  magma_queue_t queue_;
  const cudaStream_t stream_;
  const linalg::util::MagmaQueue& queue_;
  CudaEvent copied_;

  linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_;
@@ -64,8 +64,8 @@ private:
};

template <typename ScalarType>
MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(magma_queue_t queue)
    : queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)) {}
MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const linalg::util::MagmaQueue& queue)
    : queue_(queue) {}

template <typename ScalarType>
MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const int size, magma_queue_t queue)
@@ -99,10 +99,11 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb,
                                           const int n, const int k, const ScalarType alpha,
                                           const ScalarType beta, const int lda, const int ldb,
                                           const int ldc) {
  a_ptr_dev_.setAsync(a_ptr_, stream_);
  b_ptr_dev_.setAsync(b_ptr_, stream_);
  c_ptr_dev_.setAsync(c_ptr_, stream_);
  copied_.record(stream_);
  // TODO: store in a buffer if the performance gain is necessary.
  a_ptr_dev_.setAsync(a_ptr_, queue_);
  b_ptr_dev_.setAsync(b_ptr_, queue_);
  c_ptr_dev_.setAsync(c_ptr_, queue_);
  copied_.record(queue_);

  const int n_batched = a_ptr_.size();
  magma::magmablas_gemm_batched(transa, transb, m, n, k, alpha, a_ptr_dev_.ptr(), lda,
@@ -111,9 +112,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb,
  assert(cudaPeekAtLastError() == cudaSuccess);
}

}  // util
}  // linalg
}  // dca
}  // namespace util
}  // namespace linalg
}  // namespace dca

#endif  // DCA_HAVE_CUDA
#endif  // DCA_LINALG_UTIL_MAGMA_BATCHED_GEMM_HPP
Loading