Loading include/dca/linalg/vector.hpp +1 −0 Original line number Diff line number Diff line Loading @@ -310,6 +310,7 @@ void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs, const cudaStream_t stream) { resizeNoCopy(rhs.size()); util::memoryCopyAsync(data_, rhs.data(), size_, stream); // cudaDeviceSynchronize(); } template <typename ScalarType, DeviceType device_name, class Allocator> Loading include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation.hpp +1 −0 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ #include "dca/phys/domains/time_and_frequency/time_domain_left_oriented.hpp" #ifdef DCA_HAVE_CUDA #include "dca/linalg/util/cuda_event.hpp" #include "dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_kernels.hpp" #endif Loading include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.inc +8 −1 Original line number Diff line number Diff line Loading @@ -6,6 +6,7 @@ // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Peter Staar (taa@zurich.ibm.com) // Giovanni Balduzzi (gbalduzz@itp.phys.eth.ch) // // This class organizes the interpolation of \f$G^{0}\f$ towards the \f$G^{0}\f$-matrix. // Template specialization for GPU. Loading Loading @@ -80,6 +81,8 @@ private: dca::linalg::Vector<double, dca::linalg::GPU> tau_GPU; using G0_INTERPOLATION_TEMPLATE<parameters_type>::beta; linalg::util::CudaEvent config_copied_; }; template <typename parameters_type> Loading Loading @@ -172,6 +175,7 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::build_G0_matrix( G0_e_spin.resizeNoCopy(configuration_size); config_copied_.block(); b_ind.resize(configuration_size); r_ind.resize(configuration_size); tau.resize(configuration_size); Loading Loading @@ -212,6 +216,7 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::update_G0_matrix( int first_shuffled_index = configuration.get_first_shuffled_spin_index(e_spin); config_copied_.block(); b_ind.resize(configuration_size); r_ind.resize(configuration_size); tau.resize(configuration_size); Loading @@ -222,10 +227,12 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::update_G0_matrix( tau[l] = configuration_e_spin[l].get_tau(); } auto stream = linalg::util::getStream(thread_id, stream_id); cudaStream_t stream = linalg::util::getStream(thread_id, stream_id); // TODO: create generic container for AoS. b_ind_GPU.setAsync(b_ind, stream); r_ind_GPU.setAsync(r_ind, stream); tau_GPU.setAsync(tau, stream); config_copied_.record(stream); g0kernels::akima_interpolation_on_GPU( Nb, Nr, Nt, beta, first_shuffled_index, configuration_size, b_ind_GPU.ptr(), r_ind_GPU.ptr(), Loading Loading
include/dca/linalg/vector.hpp +1 −0 Original line number Diff line number Diff line Loading @@ -310,6 +310,7 @@ void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs, const cudaStream_t stream) { resizeNoCopy(rhs.size()); util::memoryCopyAsync(data_, rhs.data(), size_, stream); // cudaDeviceSynchronize(); } template <typename ScalarType, DeviceType device_name, class Allocator> Loading
include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation.hpp +1 −0 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ #include "dca/phys/domains/time_and_frequency/time_domain_left_oriented.hpp" #ifdef DCA_HAVE_CUDA #include "dca/linalg/util/cuda_event.hpp" #include "dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_kernels.hpp" #endif Loading
include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.inc +8 −1 Original line number Diff line number Diff line Loading @@ -6,6 +6,7 @@ // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Peter Staar (taa@zurich.ibm.com) // Giovanni Balduzzi (gbalduzz@itp.phys.eth.ch) // // This class organizes the interpolation of \f$G^{0}\f$ towards the \f$G^{0}\f$-matrix. // Template specialization for GPU. Loading Loading @@ -80,6 +81,8 @@ private: dca::linalg::Vector<double, dca::linalg::GPU> tau_GPU; using G0_INTERPOLATION_TEMPLATE<parameters_type>::beta; linalg::util::CudaEvent config_copied_; }; template <typename parameters_type> Loading Loading @@ -172,6 +175,7 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::build_G0_matrix( G0_e_spin.resizeNoCopy(configuration_size); config_copied_.block(); b_ind.resize(configuration_size); r_ind.resize(configuration_size); tau.resize(configuration_size); Loading Loading @@ -212,6 +216,7 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::update_G0_matrix( int first_shuffled_index = configuration.get_first_shuffled_spin_index(e_spin); config_copied_.block(); b_ind.resize(configuration_size); r_ind.resize(configuration_size); tau.resize(configuration_size); Loading @@ -222,10 +227,12 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::update_G0_matrix( tau[l] = configuration_e_spin[l].get_tau(); } auto stream = linalg::util::getStream(thread_id, stream_id); cudaStream_t stream = linalg::util::getStream(thread_id, stream_id); // TODO: create generic container for AoS. b_ind_GPU.setAsync(b_ind, stream); r_ind_GPU.setAsync(r_ind, stream); tau_GPU.setAsync(tau, stream); config_copied_.record(stream); g0kernels::akima_interpolation_on_GPU( Nb, Nr, Nt, beta, first_shuffled_index, configuration_size, b_ind_GPU.ptr(), r_ind_GPU.ptr(), Loading