Removed a sync point and non pinned copies. (90f7ab89) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/linalg/matrix.hpp

+0 −14

Original line number	Diff line number	Diff line
		@@ -193,11 +193,6 @@ public:
		// Swaps the contents of the matrix, including the name, with those of rhs.
		void swapWithName(Matrix<ScalarType, device_name>& rhs);

		// Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
		// + synchronization of stream
		template <DeviceType rhs_device_name>
		void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

		#ifdef DCA_HAVE_CUDA
		// Asynchronous assignment.
		template <DeviceType rhs_device_name>
		@@ -417,15 +412,6 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam
		swap(rhs);
		}

		// Copies the contents of rhs, which may reside on a different device type, into this matrix.
		// This matrix is first resized to rhs.size_; the elements are then copied with
		// util::memoryCopy, passing both leading dimensions so the two matrices may use different
		// strides.
		// thread_id and stream_id are forwarded to util::memoryCopy; per the declaration comment they
		// select the stream (getStream(thread_id, stream_id)) used for the copy, which is then
		// synchronized -- NOTE(review): the synchronization happens inside memoryCopy, not visible here.
		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
		int thread_id, int stream_id) {
		// Adopt the size of rhs before copying so the destination buffer is large enough.
		resize(rhs.size_);
		util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id,
		stream_id);
		}

		#ifdef DCA_HAVE_CUDA

		template <typename ScalarType, DeviceType device_name>

include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp

+6 −60

Original line number	Diff line number	Diff line
		@@ -284,6 +284,8 @@ private:
		std::array<linalg::util::CudaEvent, 2> m_computed_events_;

		bool config_initialized_;

		linalg::util::CudaEvent sync_streams_event_;
		};

		template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
		@@ -586,18 +588,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
		check_G0_matrices(configuration, G0_up, G0_dn);
		#endif // DCA_WITH_QMC_BIT
		}

		/*
		if(true)
		{
		std::cout << "\n\n\t G0-TOOLS \n\n";
		G0_CPU_tools_obj.build_G0_matrix(configuration, G0_up_CPU, e_UP);
		G0_CPU_tools_obj.build_G0_matrix(configuration, G0_dn_CPU, e_DN);
		dca::linalg::matrixop::difference(G0_up_CPU, G0_up);
		dca::linalg::matrixop::difference(G0_dn_CPU, G0_dn);
		}
		*/

		{ // update N for new shuffled vertices
		// profiler_type profiler("N-matrix (update)", "CT-AUX walker", __LINE__, thread_id);

		@@ -609,17 +599,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
		#endif // DCA_WITH_QMC_BIT
		}

		/*
		if(true)
		{
		std::cout << "\n\n\t N-TOOLS : " << sign << "\t" << configuration.size() << "\n\n";
		N_CPU_tools_obj.build_N_matrix(configuration, N_up_CPU, G0_up_CPU, e_UP);
		N_CPU_tools_obj.build_N_matrix(configuration, N_dn_CPU, G0_dn_CPU, e_DN);
		dca::linalg::matrixop::difference(N_up_CPU, N_up);
		dca::linalg::matrixop::difference(N_dn_CPU, N_dn);
		}
		*/

		{ // update G for new shuffled vertices
		// profiler_type profiler("G-matrix (update)", "CT-AUX walker", __LINE__, thread_id);

		@@ -630,26 +609,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
		check_G_matrices(configuration, G0_up, G0_dn, N_up, N_dn, G_up, G_dn);
		#endif // DCA_WITH_QMC_BIT
		}

		/*
		{
		std::cout << "\n\n\t G-TOOLS\n\n";
		G_CPU_tools_obj.build_G_matrix(configuration, N_up_CPU, G0_up_CPU, G_up_CPU, e_UP);
		G_CPU_tools_obj.build_G_matrix(configuration, N_dn_CPU, G0_dn_CPU, G_dn_CPU, e_DN);
		dca::linalg::matrixop::difference(G_up_CPU, G_up);
		dca::linalg::matrixop::difference(G_dn_CPU, G_dn);
		}
		*/

		/*
		#ifdef DCA_WITH_QMC_BIT
		if(concurrency.id()==0 and thread_id==0)
		std::cout << "\t N-update check :" << std::endl;

		N_tools_obj.check_N_matrix(configuration, N_up, G0_up, Gamma_up, e_UP);
		N_tools_obj.check_N_matrix(configuration, N_dn, G0_dn, Gamma_dn, e_DN);
		#endif // DCA_WITH_QMC_BIT
		*/
		}

		template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
		@@ -1427,23 +1386,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::clean_up_the_configuration()
		SHRINK_tools_obj.reorganize_configuration_test(configuration, N_up, N_dn, G0_up, G0_dn);

		assert(configuration.assert_consistency());

		// #ifdef DCA_WITH_QMC_BIT
		// check_N_matrices(configuration, G0_up, G0_dn, N_up, N_dn);

		// if (concurrency.id() == concurrency.first()) {
		// std::cout << "\t\t <k> = " <<
		// configuration.get_number_of_interacting_HS_spins()
		// << std::endl;
		// std::cout << "\t\t # creatable spins = " <<
		// configuration.get_number_of_creatable_HS_spins()
		// << std::endl;
		// std::cout << "\t N-woodburry check (2) :" << std::endl;
		// }

		// N_tools_obj.check_N_matrix(configuration, N_up, G0_up, Gamma_up, e_UP);
		// N_tools_obj.check_N_matrix(configuration, N_dn, G0_dn, Gamma_dn, e_DN);
		// #endif // DCA_WITH_QMC_BIT
		}

		template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
		@@ -1560,6 +1502,10 @@ template <dca::linalg::DeviceType device_t, class Parameters, class Data, typena
		template <typename AccumType>
		const linalg::util::CudaEvent* CtauxWalker<device_t, Parameters, Data, Real>::compute_M(
		std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
		// Stream 1 waits on stream 0.
		sync_streams_event_.record(linalg::util::getStream(thread_id, 0));
		sync_streams_event_.block(linalg::util::getStream(thread_id, 1));

		for (int s = 0; s < 2; ++s) {
		const auto& config = get_configuration().get(s == 0 ? e_UP : e_DN);
		exp_v_minus_one_[s].resizeNoCopy(config.size());

include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp

+10 −24

Original line number	Diff line number	Diff line
		@@ -20,6 +20,7 @@
		#include <vector>

		#include "dca/io/buffer.hpp"
		#include "dca/linalg/util/allocators/vectors_typedefs.hpp"
		#include "dca/phys/dca_step/cluster_solver/ctaux/domains/hs_field_sign_domain.hpp"
		#include "dca/phys/dca_step/cluster_solver/ctaux/domains/hs_spin_domain.hpp"
		#include "dca/phys/dca_step/cluster_solver/ctaux/structs/vertex_pair.hpp"
		@@ -70,7 +71,8 @@ public:
		std::vector<int>& get_changed_spin_indices();
		std::vector<HS_spin_states_type>& get_changed_spin_values();

		std::vector<int>& get_changed_spin_indices_e_spin(e_spin_states_type e_spin_type);
		auto& get_changed_spin_indices_e_spin(e_spin_states_type e_spin_type);

		std::vector<HS_spin_states_type>& get_changed_spin_values_e_spin(e_spin_states_type e_spin_type);

		int get_number_of_interacting_HS_spins();
		@@ -127,10 +129,11 @@ private:
		std::vector<int> changed_spin_indices;
		std::vector<HS_spin_states_type> changed_spin_values;

		std::vector<int> changed_spin_indices_e_UP; // = { changed_spin_indices of configuration_e_UP}
		std::vector<HS_spin_states_type> changed_spin_values_e_UP;
		using HostVector = linalg::util::HostVector<int>;
		HostVector changed_spin_indices_e_UP; // = { changed_spin_indices of configuration_e_UP}
		HostVector changed_spin_indices_e_DN; // = { changed_spin_indices of configuration_e_DN}

		std::vector<int> changed_spin_indices_e_DN; // = { changed_spin_indices of configuration_e_DN}
		std::vector<HS_spin_states_type> changed_spin_values_e_UP;
		std::vector<HS_spin_states_type> changed_spin_values_e_DN;

		const int max_num_noninteracting_spins_;
		@@ -144,23 +147,6 @@ CT_AUX_HS_configuration<parameters_type>::CT_AUX_HS_configuration(parameters_typ
		: parameters(parameters_ref),
		rng(rng_ref),

		configuration(),

		configuration_e_UP(0),
		configuration_e_DN(0),

		current_Nb_of_creatable_spins(0),
		current_Nb_of_annihilatable_spins(0),

		changed_spin_indices(0),
		changed_spin_values(0),

		changed_spin_indices_e_UP(0),
		changed_spin_values_e_UP(0),

		changed_spin_indices_e_DN(0),
		changed_spin_values_e_DN(0),

		// Rounding up ensures a value >= 1.
		max_num_noninteracting_spins_((parameters.get_max_submatrix_size() + 1) / 2),

		@@ -426,12 +412,12 @@ void CT_AUX_HS_configuration<parameters_type>::add_delayed_HS_spin(int configura
		template <class parameters_type>
		void CT_AUX_HS_configuration<parameters_type>::add_delayed_HS_spin_to_configuration_e_spin(
		int configuration_index, HS_spin_states_type spin_value) {
		std::vector<int>& changed_spin_indices_e_spin_first =
		auto& changed_spin_indices_e_spin_first =
		get_changed_spin_indices_e_spin(configuration[configuration_index].get_e_spins().first);
		std::vector<HS_spin_states_type>& changed_spin_values_e_spin_first =
		get_changed_spin_values_e_spin(configuration[configuration_index].get_e_spins().first);

		std::vector<int>& changed_spin_indices_e_spin_second =
		auto& changed_spin_indices_e_spin_second =
		get_changed_spin_indices_e_spin(configuration[configuration_index].get_e_spins().second);
		std::vector<HS_spin_states_type>& changed_spin_values_e_spin_second =
		get_changed_spin_values_e_spin(configuration[configuration_index].get_e_spins().second);
		@@ -473,7 +459,7 @@ std::vector<HS_spin_states_type>& CT_AUX_HS_configuration<parameters_type>::get_
		}

		template <class parameters_type>
		std::vector<int>& CT_AUX_HS_configuration<parameters_type>::get_changed_spin_indices_e_spin(
		auto& CT_AUX_HS_configuration<parameters_type>::get_changed_spin_indices_e_spin(
		e_spin_states_type e_spin) {
		if (e_spin == e_UP)
		return changed_spin_indices_e_UP;

include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_cpu.inc

+9 −6

Original line number	Diff line number	Diff line
		@@ -17,13 +17,16 @@ class N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real> {
		typedef typename Parameters::concurrency_type concurrency_type;
		typedef typename Parameters::profiler_type profiler_t;

		template<class T>
		using HostVector = linalg::util::HostVector<T>;

		public:
		N_MATRIX_TOOLS(int id,Parameters& parameters_ref);

		Real* get_device_ptr(dca::linalg::Vector<Real, dca::linalg::CPU>& v);

		int* get_permutation();
		void set_permutation(std::vector<int>& p);
		const int* get_permutation() const;
		void set_permutation(const HostVector<int>& p);

		void set_d_vector(dca::linalg::Vector<Real, dca::linalg::CPU>& d_inv);

		@@ -32,7 +35,7 @@ public:
		void copy_rows(dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
		dca::linalg::Matrix<Real, dca::linalg::CPU>& N_new_spins);

		void compute_G_cols(std::vector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
		void compute_G_cols(HostVector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
		dca::linalg::Matrix<Real, dca::linalg::CPU>& G,
		dca::linalg::Matrix<Real, dca::linalg::CPU>& G_cols);

		@@ -77,12 +80,12 @@ Real* N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::get_device_ptr(
		}

		template <class Parameters, typename Real>
		int* N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::get_permutation() {
		const int* N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::get_permutation() const {
		return permutation.ptr();
		}

		template <class Parameters, typename Real>
		void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::set_permutation(std::vector<int>& p) {
		void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::set_permutation(const HostVector<int>& p) {
		permutation = p;
		}

		@@ -113,7 +116,7 @@ void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::copy_rows(

		template <class Parameters, typename Real>
		void N_MATRIX_TOOLS<dca::linalg::CPU,Parameters, Real>::compute_G_cols(
		std::vector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
		HostVector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::CPU>& N,
		dca::linalg::Matrix<Real, dca::linalg::CPU>& G,
		dca::linalg::Matrix<Real, dca::linalg::CPU>& G_cols) {
		assert(N.nrRows() == G.nrRows());

include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc

+11 −8

Original line number	Diff line number	Diff line
		@@ -17,11 +17,14 @@ class N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real> {
		typedef typename Parameters::concurrency_type concurrency_type;
		typedef typename Parameters::profiler_type profiler_t;

		template<class T>
		using HostVector = linalg::util::HostVector<T>;

		public:
		N_MATRIX_TOOLS(int id, Parameters& parameters_ref);

		int* get_permutation();
		void set_permutation(std::vector<int>& p);
		const int* get_permutation() const;
		void set_permutation(const HostVector<int>& p);

		void set_d_vector(dca::linalg::Vector<Real, dca::linalg::CPU>& d_inv);

		@@ -32,7 +35,7 @@ public:
		void copy_rows(dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
		dca::linalg::Matrix<Real, dca::linalg::GPU>& N_new_spins);

		void compute_G_cols(std::vector<Real>& exp_V, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
		void compute_G_cols(HostVector <Real> &exp_V, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
		dca::linalg::Matrix<Real, dca::linalg::GPU>& G,
		dca::linalg::Matrix<Real, dca::linalg::GPU>& G_cols);

		@@ -85,12 +88,12 @@ N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::N_MATRIX_TOOLS(int id, Param
		}

		template <class Parameters, typename Real>
		int* N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::get_permutation() {
		const int* N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::get_permutation() const {
		return permutation.ptr();
		}

		template <class Parameters, typename Real>
		void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::set_permutation(std::vector<int>& p) {
		void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::set_permutation(const HostVector<int>& p) {
		permutation.setAsync(p, thread_id, stream_id);
		}

		@@ -129,7 +132,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::copy_rows(

		template <class Parameters, typename Real>
		void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters, Real>::compute_G_cols(
		std::vector<Real>& exp_V_CPU, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
		HostVector<Real> &exp_V_CPU, dca::linalg::Matrix<Real, dca::linalg::GPU>& N,
		dca::linalg::Matrix<Real, dca::linalg::GPU>& G,
		dca::linalg::Matrix<Real, dca::linalg::GPU>& G_cols) {
		exp_V.setAsync(exp_V_CPU, linalg::util::getStream(thread_id, stream_id));