Commit 90f7ab89 authored by gbalduzz's avatar gbalduzz
Browse files

Removed a sync point and non-pinned copies.

parent 10958ec9
Loading
Loading
Loading
Loading
+0 −14
Original line number Diff line number Diff line
@@ -193,11 +193,6 @@ public:
  // Swaps the contents of the matrix, including the name, with those of rhs.
  void swapWithName(Matrix<ScalarType, device_name>& rhs);

  // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
  // + synchronization of stream
  template <DeviceType rhs_device_name>
  void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

#ifdef DCA_HAVE_CUDA
  // Asynchronous assignment.
  template <DeviceType rhs_device_name>
@@ -417,15 +412,6 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam
  swap(rhs);
}

// Assigns the contents of rhs to this matrix: resizes this matrix to rhs.size_, then copies the
// elements of rhs with util::memoryCopy, forwarding thread_id and stream_id to it.
// rhs is templated on its own device (rhs_device_name), independent of this matrix's device_name.
// NOTE(review): the class declaration describes this as an asynchronous copy on
// getStream(thread_id, stream_id) followed by a synchronization of that stream; the
// synchronization is presumably performed inside util::memoryCopy -- confirm.
template <typename ScalarType, DeviceType device_name>
template <DeviceType rhs_device_name>
void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
                                          int thread_id, int stream_id) {
  resize(rhs.size_);
  // Destination and source leading dimensions are passed separately to the copy.
  util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id,
                   stream_id);
}

#ifdef DCA_HAVE_CUDA

template <typename ScalarType, DeviceType device_name>
+6 −60
Original line number Diff line number Diff line
@@ -284,6 +284,8 @@ private:
  std::array<linalg::util::CudaEvent, 2> m_computed_events_;

  bool config_initialized_;

  linalg::util::CudaEvent sync_streams_event_;
};

template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
@@ -586,18 +588,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
    check_G0_matrices(configuration, G0_up, G0_dn);
#endif  // DCA_WITH_QMC_BIT
  }

  /*
    if(true)
    {
    std::cout << "\n\n\t G0-TOOLS \n\n";
    G0_CPU_tools_obj.build_G0_matrix(configuration, G0_up_CPU, e_UP);
    G0_CPU_tools_obj.build_G0_matrix(configuration, G0_dn_CPU, e_DN);
    dca::linalg::matrixop::difference(G0_up_CPU, G0_up);
    dca::linalg::matrixop::difference(G0_dn_CPU, G0_dn);
    }
  */

  {  // update N for new shuffled vertices
     // profiler_type profiler("N-matrix (update)", "CT-AUX walker", __LINE__, thread_id);

@@ -609,17 +599,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
#endif  // DCA_WITH_QMC_BIT
  }

  /*
    if(true)
    {
    std::cout << "\n\n\t N-TOOLS : " << sign << "\t" << configuration.size() << "\n\n";
    N_CPU_tools_obj.build_N_matrix(configuration, N_up_CPU, G0_up_CPU, e_UP);
    N_CPU_tools_obj.build_N_matrix(configuration, N_dn_CPU, G0_dn_CPU, e_DN);
    dca::linalg::matrixop::difference(N_up_CPU, N_up);
    dca::linalg::matrixop::difference(N_dn_CPU, N_dn);
    }
  */

  {  // update G for new shuffled vertices
     // profiler_type profiler("G-matrix (update)", "CT-AUX walker", __LINE__, thread_id);

@@ -630,26 +609,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
    check_G_matrices(configuration, G0_up, G0_dn, N_up, N_dn, G_up, G_dn);
#endif  // DCA_WITH_QMC_BIT
  }

  /*
    {
    std::cout << "\n\n\t G-TOOLS\n\n";
    G_CPU_tools_obj.build_G_matrix(configuration, N_up_CPU, G0_up_CPU, G_up_CPU, e_UP);
    G_CPU_tools_obj.build_G_matrix(configuration, N_dn_CPU, G0_dn_CPU, G_dn_CPU, e_DN);
    dca::linalg::matrixop::difference(G_up_CPU, G_up);
    dca::linalg::matrixop::difference(G_dn_CPU, G_dn);
    }
  */

  /*
#ifdef DCA_WITH_QMC_BIT
    if(concurrency.id()==0 and thread_id==0)
    std::cout << "\t N-update check :" << std::endl;

    N_tools_obj.check_N_matrix(configuration, N_up, G0_up, Gamma_up, e_UP);
    N_tools_obj.check_N_matrix(configuration, N_dn, G0_dn, Gamma_dn, e_DN);
#endif  // DCA_WITH_QMC_BIT
  */
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
@@ -1427,23 +1386,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::clean_up_the_configuration()
  SHRINK_tools_obj.reorganize_configuration_test(configuration, N_up, N_dn, G0_up, G0_dn);

  assert(configuration.assert_consistency());

  // #ifdef DCA_WITH_QMC_BIT
  //   check_N_matrices(configuration, G0_up, G0_dn, N_up, N_dn);

  //   if (concurrency.id() == concurrency.first()) {
  //     std::cout << "\t\t <k>               = " <<
  //     configuration.get_number_of_interacting_HS_spins()
  //               << std::endl;
  //     std::cout << "\t\t # creatable spins = " <<
  //     configuration.get_number_of_creatable_HS_spins()
  //               << std::endl;
  //     std::cout << "\t N-woodburry check (2) :" << std::endl;
  //   }

  //   N_tools_obj.check_N_matrix(configuration, N_up, G0_up, Gamma_up, e_UP);
  //   N_tools_obj.check_N_matrix(configuration, N_dn, G0_dn, Gamma_dn, e_DN);
  // #endif  // DCA_WITH_QMC_BIT
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
@@ -1560,6 +1502,10 @@ template <dca::linalg::DeviceType device_t, class Parameters, class Data, typena
template <typename AccumType>
const linalg::util::CudaEvent* CtauxWalker<device_t, Parameters, Data, Real>::compute_M(
    std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
  // Stream 1 waits on stream 0.
  sync_streams_event_.record(linalg::util::getStream(thread_id, 0));
  sync_streams_event_.block(linalg::util::getStream(thread_id, 1));

  for (int s = 0; s < 2; ++s) {
    const auto& config = get_configuration().get(s == 0 ? e_UP : e_DN);
    exp_v_minus_one_[s].resizeNoCopy(config.size());
+10 −24
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include <vector>

#include "dca/io/buffer.hpp"
#include "dca/linalg/util/allocators/vectors_typedefs.hpp"
#include "dca/phys/dca_step/cluster_solver/ctaux/domains/hs_field_sign_domain.hpp"
#include "dca/phys/dca_step/cluster_solver/ctaux/domains/hs_spin_domain.hpp"
#include "dca/phys/dca_step/cluster_solver/ctaux/structs/vertex_pair.hpp"
@@ -70,7 +71,8 @@ public:
  std::vector<int>& get_changed_spin_indices();
  std::vector<HS_spin_states_type>& get_changed_spin_values();

  std::vector<int>& get_changed_spin_indices_e_spin(e_spin_states_type e_spin_type);
  auto& get_changed_spin_indices_e_spin(e_spin_states_type e_spin_type);

  std::vector<HS_spin_states_type>& get_changed_spin_values_e_spin(e_spin_states_type e_spin_type);

  int get_number_of_interacting_HS_spins();
@@ -127,10 +129,11 @@ private:
  std::vector<int> changed_spin_indices;
  std::vector<HS_spin_states_type> changed_spin_values;

  std::vector<int> changed_spin_indices_e_UP;  // = { changed_spin_indices of configuration_e_UP}
  std::vector<HS_spin_states_type> changed_spin_values_e_UP;
  using HostVector = linalg::util::HostVector<int>;
  HostVector changed_spin_indices_e_UP;  // = { changed_spin_indices of configuration_e_UP}
  HostVector changed_spin_indices_e_DN;  // = { changed_spin_indices of configuration_e_DN}

  std::vector<int> changed_spin_indices_e_DN;  // = { changed_spin_indices of configuration_e_DN}
  std::vector<HS_spin_states_type> changed_spin_values_e_UP;
  std::vector<HS_spin_states_type> changed_spin_values_e_DN;

  const int max_num_noninteracting_spins_;
@@ -144,23 +147,6 @@ CT_AUX_HS_configuration<parameters_type>::CT_AUX_HS_configuration(parameters_typ
    : parameters(parameters_ref),
      rng(rng_ref),

      configuration(),

      configuration_e_UP(0),
      configuration_e_DN(0),

      current_Nb_of_creatable_spins(0),
      current_Nb_of_annihilatable_spins(0),

      changed_spin_indices(0),
      changed_spin_values(0),

      changed_spin_indices_e_UP(0),
      changed_spin_values_e_UP(0),

      changed_spin_indices_e_DN(0),
      changed_spin_values_e_DN(0),

      // Rounding up ensures a value >= 1.
      max_num_noninteracting_spins_((parameters.get_max_submatrix_size() + 1) / 2),

@@ -426,12 +412,12 @@ void CT_AUX_HS_configuration<parameters_type>::add_delayed_HS_spin(int configura
template <class parameters_type>
void CT_AUX_HS_configuration<parameters_type>::add_delayed_HS_spin_to_configuration_e_spin(
    int configuration_index, HS_spin_states_type spin_value) {
  std::vector<int>& changed_spin_indices_e_spin_first =
  auto& changed_spin_indices_e_spin_first =
      get_changed_spin_indices_e_spin(configuration[configuration_index].get_e_spins().first);
  std::vector<HS_spin_states_type>& changed_spin_values_e_spin_first =
      get_changed_spin_values_e_spin(configuration[configuration_index].get_e_spins().first);

  std::vector<int>& changed_spin_indices_e_spin_second =
  auto& changed_spin_indices_e_spin_second =
      get_changed_spin_indices_e_spin(configuration[configuration_index].get_e_spins().second);
  std::vector<HS_spin_states_type>& changed_spin_values_e_spin_second =
      get_changed_spin_values_e_spin(configuration[configuration_index].get_e_spins().second);
@@ -473,7 +459,7 @@ std::vector<HS_spin_states_type>& CT_AUX_HS_configuration<parameters_type>::get_
}

template <class parameters_type>
std::vector<int>& CT_AUX_HS_configuration<parameters_type>::get_changed_spin_indices_e_spin(
auto& CT_AUX_HS_configuration<parameters_type>::get_changed_spin_indices_e_spin(
    e_spin_states_type e_spin) {
  if (e_spin == e_UP)
    return changed_spin_indices_e_UP;
+9 −6
Original line number Diff line number Diff line
@@ -17,13 +17,16 @@ class N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real> {
  typedef typename Parameters::concurrency_type concurrency_type;
  typedef typename Parameters::profiler_type profiler_t;

  template<class T>
  using HostVector = linalg::util::HostVector<T>;

public:
  N_MATRIX_TOOLS(int id,Parameters& parameters_ref);

  Real* get_device_ptr(dca::linalg::Vector<Real, dca::linalg::CPU>& v);

  int* get_permutation();
  void set_permutation(std::vector<int>& p);
  const int* get_permutation() const;
  void set_permutation(const HostVector<int>& p);

  void set_d_vector(dca::linalg::Vector<Real, dca::linalg::CPU>& d_inv);

@@ -32,7 +35,7 @@ public:
  void copy_rows(dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
                 dca::linalg::Matrix<Real, dca::linalg::CPU>& N_new_spins);

  void compute_G_cols(std::vector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
  void compute_G_cols(HostVector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
                      dca::linalg::Matrix<Real, dca::linalg::CPU>& G,
                      dca::linalg::Matrix<Real, dca::linalg::CPU>& G_cols);

@@ -77,12 +80,12 @@ Real* N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::get_device_ptr(
}

 template <class Parameters, typename Real>
int* N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::get_permutation() {
const int* N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::get_permutation() const {
  return permutation.ptr();
}

 template <class Parameters, typename Real>
void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::set_permutation(std::vector<int>& p) {
void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::set_permutation(const HostVector<int>& p) {
  permutation = p;
}

@@ -113,7 +116,7 @@ void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::copy_rows(

 template <class Parameters, typename Real>
void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::compute_G_cols(
    std::vector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
    HostVector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
    dca::linalg::Matrix<Real, dca::linalg::CPU>& G,
    dca::linalg::Matrix<Real, dca::linalg::CPU>& G_cols) {
  assert(N.nrRows() == G.nrRows());
+11 −8
Original line number Diff line number Diff line
@@ -17,11 +17,14 @@ class N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real> {
  typedef typename Parameters::concurrency_type concurrency_type;
  typedef typename Parameters::profiler_type profiler_t;

  template<class T>
  using HostVector = linalg::util::HostVector<T>;

public:
  N_MATRIX_TOOLS(int id, Parameters& parameters_ref);

  int* get_permutation();
  void set_permutation(std::vector<int>& p);
  const int* get_permutation() const;
  void set_permutation(const HostVector<int>& p);

  void set_d_vector(dca::linalg::Vector<Real, dca::linalg::CPU>& d_inv);

@@ -32,7 +35,7 @@ public:
  void copy_rows(dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
                 dca::linalg::Matrix<Real, dca::linalg::GPU>& N_new_spins);

  void compute_G_cols(std::vector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
  void compute_G_cols(HostVector <Real> &exp_V, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
                      dca::linalg::Matrix<Real, dca::linalg::GPU>& G,
                      dca::linalg::Matrix<Real, dca::linalg::GPU>& G_cols);

@@ -85,12 +88,12 @@ N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::N_MATRIX_TOOLS(int id, Param
}

template <class Parameters, typename Real>
int* N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::get_permutation() {
const int* N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::get_permutation() const {
  return permutation.ptr();
}

template <class Parameters, typename Real>
void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::set_permutation(std::vector<int>& p) {
void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::set_permutation(const HostVector<int>& p) {
  permutation.setAsync(p, thread_id, stream_id);
}

@@ -129,7 +132,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::copy_rows(

template <class Parameters, typename Real>
void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::compute_G_cols(
    std::vector<Real>& exp_V_CPU, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
        HostVector<Real> &exp_V_CPU, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
        dca::linalg::Matrix<Real, dca::linalg::GPU>& G,
        dca::linalg::Matrix<Real, dca::linalg::GPU>& G_cols) {
  exp_V.setAsync(exp_V_CPU, linalg::util::getStream(thread_id, stream_id));
Loading