Unverified Commit 06121c34 authored by Peter Doak's avatar Peter Doak Committed by GitHub
Browse files

Merge pull request #77 from CompFUSE/fix_76

Fix #76
parents 923e6015 0eb8af2b
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -59,13 +59,14 @@ void SpaceTransform2D<RDmn, KDmn, Real>::execute(
  const int nc = RDmn::dmn_size();
  const int nc2 = nc * nc;
  linalg::Matrix<Complex, linalg::CPU> tmp(nc);
  const Complex norm = Complex(1. / nc);
  const auto& T = get_T_matrix();

  for (int l = 0; l < other_size; ++l) {
    linalg::MatrixView<Complex, linalg::CPU> f_r_r(&f_input(l * nc2), nc);
    // f(k1,k2) = \sum exp(i(k1 * r1 - k2 *r2)) f(r1, r2)
    linalg::matrixop::gemm(T, f_r_r, tmp);
    linalg::matrixop::gemm('N', 'C', Complex(1), tmp, T, Complex(0), f_r_r);
    linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r);
  }
}

@@ -88,11 +89,11 @@ void SpaceTransform2D<RDmn, KDmn, Real>::execute(
            linalg::MatrixView<Complex, linalg::CPU> f_r_r(&f_input(0, 0, b1, b2, s, w1, w2), nc);
            // f(k1,k2) = \sum exp(i(k1 * r1 - k2 *r2)) f(r1, r2)
            linalg::matrixop::gemm(T, f_r_r, tmp);
            linalg::matrixop::gemm('N', 'C', Complex(1), tmp, T, Complex(0), f_r_r);
            linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r);

            for (int k2 = 0; k2 < nc; ++k2)
              for (int k1 = 0; k1 < nc; ++k1)
                f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2) * norm;
                f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2);
          }
}

+45 −16
Original line number Diff line number Diff line
@@ -9,14 +9,18 @@
//
// Helper class for adding and subtracting momentum and frequency on the device.

// TODO: split G4Helper and G4HelperManager into two separate file.

#ifndef DCA_INCLUDE_DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_SHARED_TOOLS_ACCUMULATION_TP_G4_HELPER_CUH
#define DCA_INCLUDE_DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_SHARED_TOOLS_ACCUMULATION_TP_G4_HELPER_CUH

#include <array>
#include <cuda.h>
#include <mutex>
#include <vector>
#include <stdexcept>

#include <cuda.h>

namespace dca {
namespace phys {
namespace solver {
@@ -28,7 +32,7 @@ class G4Helper {
public:
  __host__ void set(int nb, int nk, int nw_pos, const std::vector<int>& k_ex_indices,
                    const std::vector<int>& w_ex_indices, const int* add_k, int lda,
                    const int* sub_k, int lds);
                    const int* sub_k, int lds, int k0);

  G4Helper(const G4Helper& other) = default;

@@ -40,6 +44,8 @@ public:
  __device__ inline int addKex(int k_idx, int k_ex_idx) const;
  // Returns the index of k_ex - k.
  __device__ inline int kexMinus(int k_idx, int k_ex_idx) const;
  // Returns the index of -k.
  __device__ inline int kMinus(int k_idx) const;
  // Returns the index of w + w_ex.
  __device__ inline int addWex(int w_idx, int w_ex_idx) const;
  // Returns the index of w_ex - w.
@@ -49,7 +55,7 @@ public:
  // to the extended (positive for w1) domain used by G.
  // In/Out: w1, w2.
  // Returns: true if G(w1, w2) is stored as a complex conjugate.
  __device__ inline bool extendWIndices(int& w1, int& w2) const;
  __device__ inline bool extendGIndices(int& k1, int& k2, int& w1, int& w2) const;

  // Returns the linear index of G4 as a function of
  // band, band, band, band, k1, k2, k_ex, w1, w2, w_ex.
@@ -71,6 +77,7 @@ protected:
  // lds: leading dimension of sub_matrix_.
  // nw_pos: number of positive frequencies stored in G4.
  // ext_size: difference between the number of positive frequencies stored in G and G4.
  // k0: index of the origin.
  int* device_members_ = nullptr;

  int* w_ex_indices_ = nullptr;
@@ -85,11 +92,25 @@ public:
  G4HelperManager() = default;
  __host__ ~G4HelperManager();
  __host__ __device__ G4HelperManager(const G4HelperManager& other) = delete;

  static G4HelperManager& get_instance() {
    static G4HelperManager instance;
    return instance;
  }

  static void set_instance(int nb, int nk, int nw_pos, const std::vector<int>& delta_k_indices,
                           const std::vector<int>& delta_w_indices, const int* add_k, int lda,
                           const int* sub_k, int lds, int k0) {
    static std::once_flag flag;
    std::call_once(flag, [&]() {
      get_instance().set(nb, nk, nw_pos, delta_k_indices, delta_w_indices, add_k, lda, sub_k, lds, k0);
    });
  }
};

__host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& delta_k,
inline __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>& delta_k,
                                   const std::vector<int>& delta_w, const int* add_k, int lda,
                            const int* sub_k, int lds) {
                                   const int* sub_k, int lds, int k0) {
  if (isInitialized())
    throw(std::logic_error("already initialized."));

@@ -102,7 +123,7 @@ __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>&
  for (const int idx : delta_w)
    ext_size = std::max(ext_size, std::abs(idx));

  const std::array<int, 4> device_members_host{lda, lds, nw_pos, ext_size};
  const std::array<int, 5> device_members_host{lda, lds, nw_pos, ext_size, k0};
  cudaMalloc(&device_members_, sizeof(int) * device_members_host.size());
  cudaMemcpy(device_members_, device_members_host.data(), sizeof(int) * device_members_host.size(),
             cudaMemcpyHostToDevice);
@@ -129,28 +150,34 @@ __host__ void G4Helper::set(int nb, int nk, int nw_pos, const std::vector<int>&
  cudaMemcpy(sbdm_steps_, steps_host.data(), sizeof(int) * steps_host.size(), cudaMemcpyHostToDevice);
}

__device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const {
inline __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const {
  return w_idx + w_ex_indices_[w_ex_idx];
}

__device__ int G4Helper::wexMinus(const int w_idx, const int w_ex_idx) const {
inline __device__ int G4Helper::wexMinus(const int w_idx, const int w_ex_idx) const {
  const int nw = 2 * device_members_[2];
  return w_ex_indices_[w_ex_idx] + nw - 1 - w_idx;
}

__device__ int G4Helper::addKex(const int k_idx, const int k_ex_idx) const {
inline __device__ int G4Helper::addKex(const int k_idx, const int k_ex_idx) const {
  const int ld = device_members_[0];
  const int k_ex = k_ex_indices_[k_ex_idx];
  return add_matrix_[k_idx + ld * k_ex];
}

__device__ int G4Helper::kexMinus(const int k_idx, const int k_ex_idx) const {
inline __device__ int G4Helper::kexMinus(const int k_idx, const int k_ex_idx) const {
  const int ld = device_members_[1];
  const int k_ex = k_ex_indices_[k_ex_idx];
  return sub_matrix_[k_idx + ld * k_ex];
}

__device__ bool G4Helper::extendWIndices(int& w1, int& w2) const {
inline __device__ int G4Helper::kMinus(const int k_idx) const {
  const int ld = device_members_[1];
  const auto k0 = device_members_[4];
  return sub_matrix_[k_idx + ld * k0];
}

inline __device__ bool G4Helper::extendGIndices(int& k1, int& k2, int& w1, int& w2) const {
  const int extension = device_members_[3];
  const int n_w_ext_pos = extension + device_members_[2];
  w1 += extension;
@@ -162,24 +189,26 @@ __device__ bool G4Helper::extendWIndices(int& w1, int& w2) const {
  else {
    w1 = n_w_ext_pos - 1 - w1;
    w2 = 2 * n_w_ext_pos - 1 - w2;
    k1 = kMinus(k1);
    k2 = kMinus(k2);
    return true;
  }
}

__device__ int G4Helper::g4Index(int b1, int b2, int b3, int b4, int k1, int k2, int k_ex, int w1,
                                 int w2, int w_ex) const {
inline __device__ int G4Helper::g4Index(int b1, int b2, int b3, int b4, int k1, int k2, int k_ex,
                                        int w1, int w2, int w_ex) const {
  return sbdm_steps_[0] * b1 + sbdm_steps_[1] * b2 + sbdm_steps_[2] * b3 + sbdm_steps_[3] * b4 +
         sbdm_steps_[4] * k1 + sbdm_steps_[5] * k2 + sbdm_steps_[6] * k_ex + sbdm_steps_[7] * w1 +
         sbdm_steps_[8] * w2 + sbdm_steps_[9] * w_ex;
  ;
}

__device__ int G4Helper::g4Index(int k1, int k2, int k_ex, int w1, int w2, int w_ex) const {
inline __device__ int G4Helper::g4Index(int k1, int k2, int k_ex, int w1, int w2, int w_ex) const {
  return sbdm_steps_[4] * k1 + sbdm_steps_[5] * k2 + sbdm_steps_[6] * k_ex + sbdm_steps_[7] * w1 +
         sbdm_steps_[8] * w2 + sbdm_steps_[9] * w_ex;
}

__host__ G4HelperManager::~G4HelperManager() {
inline __host__ G4HelperManager::~G4HelperManager() {
  cudaFree(G4Helper::add_matrix_);
  cudaFree(G4Helper::sub_matrix_);
  cudaFree(G4Helper::device_members_);
+1 −1
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, int lggu,

void initializeG4Helpers(int nb, int nk, int nw_pos, const std::vector<int>& delta_k_indices,
                         const std::vector<int>& delta_w_indices, const int* add_k, int lda,
                         const int* sub_k, int lds);
                         const int* sub_k, int lds, int k0);

}  // details
}  // accumulator
+13 −2
Original line number Diff line number Diff line
@@ -311,11 +311,16 @@ std::complex<typename TpAccumulator<Parameters, linalg::CPU>::Real> TpAccumulato
  auto minus_w1 = [=](const int w) { return n_pos_frqs_ - 1 - w; };
  auto minus_w2 = [=](const int w) { return 2 * n_pos_frqs_ - 1 - w; };
  auto plus_w1 = [=](const int w) { return w - n_pos_frqs_; };
  auto minus_k = [=](const int k) {
    const static int k0 = RDmn::parameter_type::origin_index();
    return RDmn::parameter_type::subtract(k, k0);

  };

  if (w1_ext >= n_pos_frqs_)
    return G_(0, 0, s, k1, k2, plus_w1(w1_ext), w2_ext);
  else
    return std::conj(G_(0, 0, s, k1, k2, minus_w1(w1_ext), minus_w2(w2_ext)));
    return std::conj(G_(0, 0, s, minus_k(k1), minus_k(k2), minus_w1(w1_ext), minus_w2(w2_ext)));
}

template <class Parameters>
@@ -327,6 +332,12 @@ void TpAccumulator<Parameters, linalg::CPU>::getGMultiband(int s, int k1, int k2
  auto minus_w2 = [=](const int w) { return 2 * n_pos_frqs_ - 1 - w; };
  auto plus_w1 = [=](const int w) { return w - n_pos_frqs_; };

  auto minus_k = [=](const int k) {
    const static int k0 = RDmn::parameter_type::origin_index();
    return RDmn::parameter_type::subtract(k, k0);

  };

  if (w1_ext >= n_pos_frqs_) {
    const Complex* const G_ptr = &G_(0, 0, s, k1, k2, plus_w1(w1_ext), w2_ext);
    for (int b2 = 0; b2 < n_bands_; ++b2)
@@ -334,7 +345,7 @@ void TpAccumulator<Parameters, linalg::CPU>::getGMultiband(int s, int k1, int k2
        G(b1, b2) = beta * G(b1, b2) + G_ptr[b1 + b2 * n_bands_];
  }
  else {
    const Complex* const G_ptr = &G_(0, 0, s, k1, k2, minus_w1(w1_ext), minus_w2(w2_ext));
    const Complex* const G_ptr = &G_(0, 0, s, minus_k(k1), minus_k(k2), minus_w1(w1_ext), minus_w2(w2_ext));
    for (int b2 = 0; b2 < n_bands_; ++b2)
      for (int b1 = 0; b1 < n_bands_; ++b1)
        G(b1, b2) = beta * G(b1, b2) + std::conj(G_ptr[b1 + b2 * n_bands_]);
+5 −3
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
#include "dca/linalg/util/magma_queue.hpp"
#include "dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp"
#include "dca/parallel/util/call_once_per_loop.hpp"
#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh"
#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp"
#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/ndft/cached_ndft_gpu.hpp"

@@ -218,11 +219,12 @@ void TpAccumulator<Parameters, linalg::GPU>::initializeG4Helpers() const {
  std::call_once(flag, []() {
    const auto& add_mat = KDmn::parameter_type::get_add_matrix();
    const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix();
    const int k0 = KDmn::parameter_type::origin_index();
    const auto& w_indices = domains::FrequencyExchangeDomain::get_elements();
    const auto& q_indices = domains::MomentumExchangeDomain::get_elements();
    details::initializeG4Helpers(n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), q_indices,
                                 w_indices, add_mat.ptr(), add_mat.leadingDimension(),
                                 sub_mat.ptr(), sub_mat.leadingDimension());
    details::G4HelperManager::set_instance(
        n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), q_indices, w_indices, add_mat.ptr(),
        add_mat.leadingDimension(), sub_mat.ptr(), sub_mat.leadingDimension(), k0);
    assert(cudaPeekAtLastError() == cudaSuccess);
  });
}
Loading