Loading include/dca/math/function_transform/special_transforms/space_transform_2D.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -59,13 +59,14 @@ void SpaceTransform2D<RDmn, KDmn, Real>::execute( const int nc = RDmn::dmn_size(); const int nc2 = nc * nc; linalg::Matrix<Complex, linalg::CPU> tmp(nc); const Complex norm = Complex(1. / nc); const auto& T = get_T_matrix(); for (int l = 0; l < other_size; ++l) { linalg::MatrixView<Complex, linalg::CPU> f_r_r(&f_input(l * nc2), nc); // f(k1,k2) = \sum exp(i(k1 * r1 - k2 *r2)) f(r1, r2) linalg::matrixop::gemm(T, f_r_r, tmp); linalg::matrixop::gemm('N', 'C', Complex(1), tmp, T, Complex(0), f_r_r); linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r); } } Loading @@ -88,11 +89,11 @@ void SpaceTransform2D<RDmn, KDmn, Real>::execute( linalg::MatrixView<Complex, linalg::CPU> f_r_r(&f_input(0, 0, b1, b2, s, w1, w2), nc); // f(k1,k2) = \sum exp(i(k1 * r1 - k2 *r2)) f(r1, r2) linalg::matrixop::gemm(T, f_r_r, tmp); linalg::matrixop::gemm('N', 'C', Complex(1), tmp, T, Complex(0), f_r_r); linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r); for (int k2 = 0; k2 < nc; ++k2) for (int k1 = 0; k1 < nc; ++k1) f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2) * norm; f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2); } } Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh +45 −16 Original line number Diff line number Diff line Loading @@ -9,14 +9,18 @@ // // Helper class for adding and subtracting momentum and frequency on the device. // TODO: split G4Helper and G4HelperManager into two separate file. #ifndef DCA_INCLUDE_DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_SHARED_TOOLS_ACCUMULATION_TP_G4_HELPER_CUH #define DCA_INCLUDE_DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_SHARED_TOOLS_ACCUMULATION_TP_G4_HELPER_CUH #include <array> #include <cuda.h> #include <mutex> #include <vector> #include <stdexcept> #include <cuda.h> namespace dca { namespace phys { namespace solver { Loading @@ -28,7 +32,7 @@ class G4Helper { public: __host__ void set(int nb, int nk, int nw_pos, const std::vector<int>& k_ex_indices, const std::vector<int>& w_ex_indices, const int* add_k, int lda, const int* sub_k, int lds); const int* sub_k, int lds, int k0); G4Helper(const G4Helper& other) = default; Loading @@ -40,6 +44,8 @@ public: __device__ inline int addKex(int k_idx, int k_ex_idx) const; // Returns the index of k_ex - k. __device__ inline int kexMinus(int k_idx, int k_ex_idx) const; // Returns the index of -k. __device__ inline int kMinus(int k_idx) const; // Returns the index of w + w_ex. __device__ inline int addWex(int w_idx, int w_ex_idx) const; // Returns the index of w_ex - w. Loading @@ -49,7 +55,7 @@ public: // to the extended (positive for w1) domain used by G. // In/Out: w1, w2. // Returns: true if G(w1, w2) is stored as a complex conjugate. __device__ inline bool extendWIndices(int& w1, int& w2) const; __device__ inline bool extendGIndices(int& k1, int& k2, int& w1, int& w2) const; // Returns the linear index of G4 as a function of // band, band, band, band, k1, k2, k_ex, w1, w2, w_ex. Loading @@ -71,6 +77,7 @@ protected: // lds: leading dimension of sub_matrix_. // nw_pos: number of positive frequencies stored in G4. // ext_size: difference between the number of positive frequencies stored in G and G4. // k0: index of the origin. int* device_members_ = nullptr; int* w_ex_indices_ = nullptr; Loading @@ -85,11 +92,25 @@ public: G4HelperManager() = default; __host__ ~G4HelperManager(); __host__ __device__ G4HelperManager(const G4HelperManager& other) = delete; static G4HelperManager& get_instance() { static G4HelperManager instance; return instance; } static void set_instance(int nb, int nk, int nw_pos, const std::vector<int>& delta_k_indices, const std::vector<int>& delta_w_indices, const int* add_k, int lda, const int* sub_k, int lds, int k0) { static std::once_flag flag; std::call_once(flag, [&]() { get_instance().set(nb, nk, nw_pos, delta_k_indices, delta_w_indices, add_k, lda, sub_k, lds, k0); }); } }; __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& delta_k, inline __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& delta_k, const std::vector<int>& delta_w, const int* add_k, int lda, const int* sub_k, int lds) { const int* sub_k, int lds, int k0) { if (isInitialized()) throw(std::logic_error("already initialized.")); Loading @@ -102,7 +123,7 @@ __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& for (const int idx : delta_w) ext_size = std::max(ext_size, std::abs(idx)); const std::array<int, 4> device_members_host{lda, lds, nw_pos, ext_size}; const std::array<int, 5> device_members_host{lda, lds, nw_pos, ext_size, k0}; cudaMalloc(&device_members_, sizeof(int) * device_members_host.size()); cudaMemcpy(device_members_, device_members_host.data(), sizeof(int) * device_members_host.size(), cudaMemcpyHostToDevice); Loading @@ -129,28 +150,34 @@ __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& cudaMemcpy(sbdm_steps_, steps_host.data(), sizeof(int) * steps_host.size(), cudaMemcpyHostToDevice); } __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const { inline __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const { return w_idx + w_ex_indices_[w_ex_idx]; } __device__ int G4Helper::wexMinus(const int w_idx, const int w_ex_idx) const { inline __device__ int G4Helper::wexMinus(const int w_idx, const int w_ex_idx) const { const int nw = 2 * device_members_[2]; return w_ex_indices_[w_ex_idx] + nw - 1 - w_idx; } __device__ int G4Helper::addKex(const int k_idx, const int k_ex_idx) const { inline __device__ int G4Helper::addKex(const int k_idx, const int k_ex_idx) const { const int ld = device_members_[0]; const int k_ex = k_ex_indices_[k_ex_idx]; return add_matrix_[k_idx + ld * k_ex]; } __device__ int G4Helper::kexMinus(const int k_idx, const int k_ex_idx) const { inline __device__ int G4Helper::kexMinus(const int k_idx, const int k_ex_idx) const { const int ld = device_members_[1]; const int k_ex = k_ex_indices_[k_ex_idx]; return sub_matrix_[k_idx + ld * k_ex]; } __device__ bool G4Helper::extendWIndices(int& w1, int& w2) const { inline __device__ int G4Helper::kMinus(const int k_idx) const { const int ld = device_members_[1]; const auto k0 = device_members_[4]; return sub_matrix_[k_idx + ld * k0]; } inline __device__ bool G4Helper::extendGIndices(int& k1, int& k2, int& w1, int& w2) const { const int extension = device_members_[3]; const int n_w_ext_pos = extension + device_members_[2]; w1 += extension; Loading @@ -162,24 +189,26 @@ __device__ bool G4Helper::extendWIndices(int& w1, int& w2) const { else { w1 = n_w_ext_pos - 1 - w1; w2 = 2 * n_w_ext_pos - 1 - w2; k1 = kMinus(k1); k2 = kMinus(k2); return true; } } __device__ int G4Helper::g4Index(int b1, int b2, int b3, int b4, int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { inline __device__ int G4Helper::g4Index(int b1, int b2, int b3, int b4, int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { return sbdm_steps_[0] * b1 + sbdm_steps_[1] * b2 + sbdm_steps_[2] * b3 + sbdm_steps_[3] * b4 + sbdm_steps_[4] * k1 + sbdm_steps_[5] * k2 + sbdm_steps_[6] * k_ex + sbdm_steps_[7] * w1 + sbdm_steps_[8] * w2 + sbdm_steps_[9] * w_ex; ; } __device__ int G4Helper::g4Index(int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { inline __device__ int G4Helper::g4Index(int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { return sbdm_steps_[4] * k1 + sbdm_steps_[5] * k2 + sbdm_steps_[6] * k_ex + sbdm_steps_[7] * w1 + sbdm_steps_[8] * w2 + sbdm_steps_[9] * w_ex; } __host__ G4HelperManager::~G4HelperManager() { inline __host__ G4HelperManager::~G4HelperManager() { cudaFree(G4Helper::add_matrix_); cudaFree(G4Helper::sub_matrix_); cudaFree(G4Helper::device_members_); Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -39,7 +39,7 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, int lggu, void initializeG4Helpers(int nb, int nk, int nw_pos, const std::vector<int>& delta_k_indices, const std::vector<int>& delta_w_indices, const int* add_k, int lda, const int* sub_k, int lds); const int* sub_k, int lds, int k0); } // details } // accumulator Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp +13 −2 Original line number Diff line number Diff line Loading @@ -311,11 +311,16 @@ std::complex<typename TpAccumulator<Parameters, linalg::CPU>::Real> TpAccumulato auto minus_w1 = [=](const int w) { return n_pos_frqs_ - 1 - w; }; auto minus_w2 = [=](const int w) { return 2 * n_pos_frqs_ - 1 - w; }; auto plus_w1 = [=](const int w) { return w - n_pos_frqs_; }; auto minus_k = [=](const int k) { const static int k0 = RDmn::parameter_type::origin_index(); return RDmn::parameter_type::subtract(k, k0); }; if (w1_ext >= n_pos_frqs_) return G_(0, 0, s, k1, k2, plus_w1(w1_ext), w2_ext); else return std::conj(G_(0, 0, s, k1, k2, minus_w1(w1_ext), minus_w2(w2_ext))); return std::conj(G_(0, 0, s, minus_k(k1), minus_k(k2), minus_w1(w1_ext), minus_w2(w2_ext))); } template <class Parameters> Loading @@ -327,6 +332,12 @@ void TpAccumulator<Parameters, linalg::CPU>::getGMultiband(int s, int k1, int k2 auto minus_w2 = [=](const int w) { return 2 * n_pos_frqs_ - 1 - w; }; auto plus_w1 = [=](const int w) { return w - n_pos_frqs_; }; auto minus_k = [=](const int k) { const static int k0 = RDmn::parameter_type::origin_index(); return RDmn::parameter_type::subtract(k, k0); }; if (w1_ext >= n_pos_frqs_) { const Complex* const G_ptr = &G_(0, 0, s, k1, k2, plus_w1(w1_ext), w2_ext); for (int b2 = 0; b2 < n_bands_; ++b2) Loading @@ -334,7 +345,7 @@ void TpAccumulator<Parameters, linalg::CPU>::getGMultiband(int s, int k1, int k2 G(b1, b2) = beta * G(b1, b2) + G_ptr[b1 + b2 * n_bands_]; } else { const Complex* const G_ptr = &G_(0, 0, s, k1, k2, minus_w1(w1_ext), minus_w2(w2_ext)); const Complex* const G_ptr = &G_(0, 0, s, minus_k(k1), minus_k(k2), minus_w1(w1_ext), minus_w2(w2_ext)); for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) G(b1, b2) = beta * G(b1, b2) + std::conj(G_ptr[b1 + b2 * n_bands_]); Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +5 −3 Original line number Diff line number Diff line Loading @@ -25,6 +25,7 @@ #include "dca/linalg/util/magma_queue.hpp" #include "dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp" #include "dca/parallel/util/call_once_per_loop.hpp" #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh" #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp" #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/ndft/cached_ndft_gpu.hpp" Loading Loading @@ -218,11 +219,12 @@ void TpAccumulator<Parameters, linalg::GPU>::initializeG4Helpers() const { std::call_once(flag, []() { const auto& add_mat = KDmn::parameter_type::get_add_matrix(); const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix(); const int k0 = KDmn::parameter_type::origin_index(); const auto& w_indices = domains::FrequencyExchangeDomain::get_elements(); const auto& q_indices = domains::MomentumExchangeDomain::get_elements(); details::initializeG4Helpers(n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), q_indices, w_indices, add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), sub_mat.leadingDimension()); details::G4HelperManager::set_instance( n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), q_indices, w_indices, add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), sub_mat.leadingDimension(), k0); assert(cudaPeekAtLastError() == cudaSuccess); }); } Loading Loading
include/dca/math/function_transform/special_transforms/space_transform_2D.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -59,13 +59,14 @@ void SpaceTransform2D<RDmn, KDmn, Real>::execute( const int nc = RDmn::dmn_size(); const int nc2 = nc * nc; linalg::Matrix<Complex, linalg::CPU> tmp(nc); const Complex norm = Complex(1. / nc); const auto& T = get_T_matrix(); for (int l = 0; l < other_size; ++l) { linalg::MatrixView<Complex, linalg::CPU> f_r_r(&f_input(l * nc2), nc); // f(k1,k2) = \sum exp(i(k1 * r1 - k2 *r2)) f(r1, r2) linalg::matrixop::gemm(T, f_r_r, tmp); linalg::matrixop::gemm('N', 'C', Complex(1), tmp, T, Complex(0), f_r_r); linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r); } } Loading @@ -88,11 +89,11 @@ void SpaceTransform2D<RDmn, KDmn, Real>::execute( linalg::MatrixView<Complex, linalg::CPU> f_r_r(&f_input(0, 0, b1, b2, s, w1, w2), nc); // f(k1,k2) = \sum exp(i(k1 * r1 - k2 *r2)) f(r1, r2) linalg::matrixop::gemm(T, f_r_r, tmp); linalg::matrixop::gemm('N', 'C', Complex(1), tmp, T, Complex(0), f_r_r); linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r); for (int k2 = 0; k2 < nc; ++k2) for (int k1 = 0; k1 < nc; ++k1) f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2) * norm; f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2); } } Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh +45 −16 Original line number Diff line number Diff line Loading @@ -9,14 +9,18 @@ // // Helper class for adding and subtracting momentum and frequency on the device. // TODO: split G4Helper and G4HelperManager into two separate file. #ifndef DCA_INCLUDE_DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_SHARED_TOOLS_ACCUMULATION_TP_G4_HELPER_CUH #define DCA_INCLUDE_DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_SHARED_TOOLS_ACCUMULATION_TP_G4_HELPER_CUH #include <array> #include <cuda.h> #include <mutex> #include <vector> #include <stdexcept> #include <cuda.h> namespace dca { namespace phys { namespace solver { Loading @@ -28,7 +32,7 @@ class G4Helper { public: __host__ void set(int nb, int nk, int nw_pos, const std::vector<int>& k_ex_indices, const std::vector<int>& w_ex_indices, const int* add_k, int lda, const int* sub_k, int lds); const int* sub_k, int lds, int k0); G4Helper(const G4Helper& other) = default; Loading @@ -40,6 +44,8 @@ public: __device__ inline int addKex(int k_idx, int k_ex_idx) const; // Returns the index of k_ex - k. __device__ inline int kexMinus(int k_idx, int k_ex_idx) const; // Returns the index of -k. __device__ inline int kMinus(int k_idx) const; // Returns the index of w + w_ex. __device__ inline int addWex(int w_idx, int w_ex_idx) const; // Returns the index of w_ex - w. Loading @@ -49,7 +55,7 @@ public: // to the extended (positive for w1) domain used by G. // In/Out: w1, w2. // Returns: true if G(w1, w2) is stored as a complex conjugate. __device__ inline bool extendWIndices(int& w1, int& w2) const; __device__ inline bool extendGIndices(int& k1, int& k2, int& w1, int& w2) const; // Returns the linear index of G4 as a function of // band, band, band, band, k1, k2, k_ex, w1, w2, w_ex. Loading @@ -71,6 +77,7 @@ protected: // lds: leading dimension of sub_matrix_. // nw_pos: number of positive frequencies stored in G4. // ext_size: difference between the number of positive frequencies stored in G and G4. // k0: index of the origin. int* device_members_ = nullptr; int* w_ex_indices_ = nullptr; Loading @@ -85,11 +92,25 @@ public: G4HelperManager() = default; __host__ ~G4HelperManager(); __host__ __device__ G4HelperManager(const G4HelperManager& other) = delete; static G4HelperManager& get_instance() { static G4HelperManager instance; return instance; } static void set_instance(int nb, int nk, int nw_pos, const std::vector<int>& delta_k_indices, const std::vector<int>& delta_w_indices, const int* add_k, int lda, const int* sub_k, int lds, int k0) { static std::once_flag flag; std::call_once(flag, [&]() { get_instance().set(nb, nk, nw_pos, delta_k_indices, delta_w_indices, add_k, lda, sub_k, lds, k0); }); } }; __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& delta_k, inline __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& delta_k, const std::vector<int>& delta_w, const int* add_k, int lda, const int* sub_k, int lds) { const int* sub_k, int lds, int k0) { if (isInitialized()) throw(std::logic_error("already initialized.")); Loading @@ -102,7 +123,7 @@ __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& for (const int idx : delta_w) ext_size = std::max(ext_size, std::abs(idx)); const std::array<int, 4> device_members_host{lda, lds, nw_pos, ext_size}; const std::array<int, 5> device_members_host{lda, lds, nw_pos, ext_size, k0}; cudaMalloc(&device_members_, sizeof(int) * device_members_host.size()); cudaMemcpy(device_members_, device_members_host.data(), sizeof(int) * device_members_host.size(), cudaMemcpyHostToDevice); Loading @@ -129,28 +150,34 @@ __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& cudaMemcpy(sbdm_steps_, steps_host.data(), sizeof(int) * steps_host.size(), cudaMemcpyHostToDevice); } __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const { inline __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const { return w_idx + w_ex_indices_[w_ex_idx]; } __device__ int G4Helper::wexMinus(const int w_idx, const int w_ex_idx) const { inline __device__ int G4Helper::wexMinus(const int w_idx, const int w_ex_idx) const { const int nw = 2 * device_members_[2]; return w_ex_indices_[w_ex_idx] + nw - 1 - w_idx; } __device__ int G4Helper::addKex(const int k_idx, const int k_ex_idx) const { inline __device__ int G4Helper::addKex(const int k_idx, const int k_ex_idx) const { const int ld = device_members_[0]; const int k_ex = k_ex_indices_[k_ex_idx]; return add_matrix_[k_idx + ld * k_ex]; } __device__ int G4Helper::kexMinus(const int k_idx, const int k_ex_idx) const { inline __device__ int G4Helper::kexMinus(const int k_idx, const int k_ex_idx) const { const int ld = device_members_[1]; const int k_ex = k_ex_indices_[k_ex_idx]; return sub_matrix_[k_idx + ld * k_ex]; } __device__ bool G4Helper::extendWIndices(int& w1, int& w2) const { inline __device__ int G4Helper::kMinus(const int k_idx) const { const int ld = device_members_[1]; const auto k0 = device_members_[4]; return sub_matrix_[k_idx + ld * k0]; } inline __device__ bool G4Helper::extendGIndices(int& k1, int& k2, int& w1, int& w2) const { const int extension = device_members_[3]; const int n_w_ext_pos = extension + device_members_[2]; w1 += extension; Loading @@ -162,24 +189,26 @@ __device__ bool G4Helper::extendWIndices(int& w1, int& w2) const { else { w1 = n_w_ext_pos - 1 - w1; w2 = 2 * n_w_ext_pos - 1 - w2; k1 = kMinus(k1); k2 = kMinus(k2); return true; } } __device__ int G4Helper::g4Index(int b1, int b2, int b3, int b4, int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { inline __device__ int G4Helper::g4Index(int b1, int b2, int b3, int b4, int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { return sbdm_steps_[0] * b1 + sbdm_steps_[1] * b2 + sbdm_steps_[2] * b3 + sbdm_steps_[3] * b4 + sbdm_steps_[4] * k1 + sbdm_steps_[5] * k2 + sbdm_steps_[6] * k_ex + sbdm_steps_[7] * w1 + sbdm_steps_[8] * w2 + sbdm_steps_[9] * w_ex; ; } __device__ int G4Helper::g4Index(int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { inline __device__ int G4Helper::g4Index(int k1, int k2, int k_ex, int w1, int w2, int w_ex) const { return sbdm_steps_[4] * k1 + sbdm_steps_[5] * k2 + sbdm_steps_[6] * k_ex + sbdm_steps_[7] * w1 + sbdm_steps_[8] * w2 + sbdm_steps_[9] * w_ex; } __host__ G4HelperManager::~G4HelperManager() { inline __host__ G4HelperManager::~G4HelperManager() { cudaFree(G4Helper::add_matrix_); cudaFree(G4Helper::sub_matrix_); cudaFree(G4Helper::device_members_); Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -39,7 +39,7 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, int lggu, void initializeG4Helpers(int nb, int nk, int nw_pos, const std::vector<int>& delta_k_indices, const std::vector<int>& delta_w_indices, const int* add_k, int lda, const int* sub_k, int lds); const int* sub_k, int lds, int k0); } // details } // accumulator Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp +13 −2 Original line number Diff line number Diff line Loading @@ -311,11 +311,16 @@ std::complex<typename TpAccumulator<Parameters, linalg::CPU>::Real> TpAccumulato auto minus_w1 = [=](const int w) { return n_pos_frqs_ - 1 - w; }; auto minus_w2 = [=](const int w) { return 2 * n_pos_frqs_ - 1 - w; }; auto plus_w1 = [=](const int w) { return w - n_pos_frqs_; }; auto minus_k = [=](const int k) { const static int k0 = RDmn::parameter_type::origin_index(); return RDmn::parameter_type::subtract(k, k0); }; if (w1_ext >= n_pos_frqs_) return G_(0, 0, s, k1, k2, plus_w1(w1_ext), w2_ext); else return std::conj(G_(0, 0, s, k1, k2, minus_w1(w1_ext), minus_w2(w2_ext))); return std::conj(G_(0, 0, s, minus_k(k1), minus_k(k2), minus_w1(w1_ext), minus_w2(w2_ext))); } template <class Parameters> Loading @@ -327,6 +332,12 @@ void TpAccumulator<Parameters, linalg::CPU>::getGMultiband(int s, int k1, int k2 auto minus_w2 = [=](const int w) { return 2 * n_pos_frqs_ - 1 - w; }; auto plus_w1 = [=](const int w) { return w - n_pos_frqs_; }; auto minus_k = [=](const int k) { const static int k0 = RDmn::parameter_type::origin_index(); return RDmn::parameter_type::subtract(k, k0); }; if (w1_ext >= n_pos_frqs_) { const Complex* const G_ptr = &G_(0, 0, s, k1, k2, plus_w1(w1_ext), w2_ext); for (int b2 = 0; b2 < n_bands_; ++b2) Loading @@ -334,7 +345,7 @@ void TpAccumulator<Parameters, linalg::CPU>::getGMultiband(int s, int k1, int k2 G(b1, b2) = beta * G(b1, b2) + G_ptr[b1 + b2 * n_bands_]; } else { const Complex* const G_ptr = &G_(0, 0, s, k1, k2, minus_w1(w1_ext), minus_w2(w2_ext)); const Complex* const G_ptr = &G_(0, 0, s, minus_k(k1), minus_k(k2), minus_w1(w1_ext), minus_w2(w2_ext)); for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) G(b1, b2) = beta * G(b1, b2) + std::conj(G_ptr[b1 + b2 * n_bands_]); Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +5 −3 Original line number Diff line number Diff line Loading @@ -25,6 +25,7 @@ #include "dca/linalg/util/magma_queue.hpp" #include "dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp" #include "dca/parallel/util/call_once_per_loop.hpp" #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh" #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp" #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/ndft/cached_ndft_gpu.hpp" Loading Loading @@ -218,11 +219,12 @@ void TpAccumulator<Parameters, linalg::GPU>::initializeG4Helpers() const { std::call_once(flag, []() { const auto& add_mat = KDmn::parameter_type::get_add_matrix(); const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix(); const int k0 = KDmn::parameter_type::origin_index(); const auto& w_indices = domains::FrequencyExchangeDomain::get_elements(); const auto& q_indices = domains::MomentumExchangeDomain::get_elements(); details::initializeG4Helpers(n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), q_indices, w_indices, add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), sub_mat.leadingDimension()); details::G4HelperManager::set_instance( n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), q_indices, w_indices, add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), sub_mat.leadingDimension(), k0); assert(cudaPeekAtLastError() == cudaSuccess); }); } Loading