Merge pull request #168 from gbalduzz/fix_ctaux_race_condition (ec228d9a) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

CMakeLists.txt

+1 −2

Original line number	Diff line number	Diff line
		@@ -144,11 +144,10 @@ set(DCA_LIBS
		${DCA_CONCURRENCY_LIB}
		${DCA_THREADING_LIBS}
		lapack
		cuda_utils
		)

		if (DCA_HAVE_CUDA)
		list(APPEND DCA_CUDA_LIBS
		cuda_utils)
		list(APPEND DCA_LIBS
		blas_kernels
		dnfft_kernels

include/dca/linalg/util/util_cublas.hpp

+0 −5

Original line number	Diff line number	Diff line
		@@ -12,11 +12,6 @@
		#ifndef DCA_LINALG_UTIL_UTIL_CUBLAS_HPP
		#define DCA_LINALG_UTIL_UTIL_CUBLAS_HPP

		#include <cublas_v2.h>
		#include <stdexcept>
		#include <string>
		#include "dca/linalg/util/error_cuda.hpp"

		namespace dca {
		namespace linalg {
		namespace util {

include/dca/linalg/vector.hpp

+1 −0

Original line number	Diff line number	Diff line
		@@ -310,6 +310,7 @@ void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs,
		const cudaStream_t stream) {
		resizeNoCopy(rhs.size());
		util::memoryCopyAsync(data_, rhs.data(), size_, stream);
		// cudaDeviceSynchronize();
		}

		template <typename ScalarType, DeviceType device_name, class Allocator>

include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation.hpp

+1 −0

Original line number	Diff line number	Diff line
		@@ -27,6 +27,7 @@
		#include "dca/phys/domains/time_and_frequency/time_domain_left_oriented.hpp"

		#ifdef DCA_HAVE_CUDA
		#include "dca/linalg/util/cuda_event.hpp"
		#include "dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_kernels.hpp"
		#endif

include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.inc

+8 −1

Original line number	Diff line number	Diff line
		@@ -6,6 +6,7 @@
		// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
		//
		// Author: Peter Staar (taa@zurich.ibm.com)
		// Giovanni Balduzzi (gbalduzz@itp.phys.eth.ch)
		//
		// This class organizes the interpolation of \f$G^{0}\f$ towards the \f$G^{0}\f$-matrix.
		// Template specialization for GPU.
		@@ -80,6 +81,8 @@ private:
		dca::linalg::Vector<double, dca::linalg::GPU> tau_GPU;

		using G0_INTERPOLATION_TEMPLATE<parameters_type>::beta;

		linalg::util::CudaEvent config_copied_;
		};

		template <typename parameters_type>
		@@ -172,6 +175,7 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::build_G0_matrix(

		G0_e_spin.resizeNoCopy(configuration_size);

		config_copied_.block();
		b_ind.resize(configuration_size);
		r_ind.resize(configuration_size);
		tau.resize(configuration_size);
		@@ -212,6 +216,7 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::update_G0_matrix(

		int first_shuffled_index = configuration.get_first_shuffled_spin_index(e_spin);

		config_copied_.block();
		b_ind.resize(configuration_size);
		r_ind.resize(configuration_size);
		tau.resize(configuration_size);
		@@ -222,10 +227,12 @@ void G0_INTERPOLATION<dca::linalg::GPU, parameters_type>::update_G0_matrix(
		tau[l] = configuration_e_spin[l].get_tau();
		}

		- auto stream = linalg::util::getStream(thread_id, stream_id);
		+ cudaStream_t stream = linalg::util::getStream(thread_id, stream_id);
		// TODO: create generic container for AoS.
		b_ind_GPU.setAsync(b_ind, stream);
		r_ind_GPU.setAsync(r_ind, stream);
		tau_GPU.setAsync(tau, stream);
		config_copied_.record(stream);

		g0kernels::akima_interpolation_on_GPU(
		Nb, Nr, Nt, beta, first_shuffled_index, configuration_size, b_ind_GPU.ptr(), r_ind_GPU.ptr(),