Extended and fixed flop computation. (9d4e80e0) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp

+9 −2

Original line number	Diff line number	Diff line
		@@ -54,7 +54,8 @@ public:
		// order of M's labels from (r, b, w) to (b, r, w).
		// The transform is equivalent to M(k1, k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 * r2)) M(r1, r2)
		// In/Out: M
		void execute(RMatrix& M);
		// Returns: number of flop.
		float execute(RMatrix& M);

		void setWorkspace(const std::shared_ptr<RMatrix>& workspace) {
		workspace_ = workspace;
		@@ -107,7 +108,9 @@ SpaceTransform2DGpu<RDmn, KDmn, Real>::SpaceTransform2DGpu(const int nw_pos, mag
		}

		template <class RDmn, class KDmn, typename Real>
		void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
		float SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
		float flop = 0.;

		auto& T_times_M = *(workspace_);
		auto& T_times_M_times_T = M;

		@@ -125,6 +128,7 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
		const int ldb = M.leadingDimension();
		const int ldc = T_times_M.leadingDimension();
		plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex(1), Complex(0), lda, ldb, ldc);
		flop += n_trafo * 8. * nc_ * M.nrCols() * nc_;
		}

		{
		@@ -138,10 +142,13 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
		const int ldc = T_times_M_times_T.leadingDimension();
		const Complex norm(1. / nc_);
		plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex(0), lda, ldb, ldc);
		flop += n_trafo * 8. * M.nrRows() * nc_ * nc_;
		}

		phaseFactorsAndRearrange(T_times_M_times_T, *workspace_);
		M.swap(*workspace_);

		return flop;
		}

		template <class RDmn, class KDmn, typename Real>

include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp

+3 −3

Original line number	Diff line number	Diff line
		@@ -185,8 +185,8 @@ protected:
		using MC_accumulator_data::DCA_iteration;
		using MC_accumulator_data::number_of_measurements;

		using MC_accumulator_data::current_sign;
		using MC_accumulator_data::accumulated_sign;
		using MC_accumulator_data::current_sign;

		const bool compute_std_deviation_;

		@@ -464,12 +464,12 @@ void CtauxAccumulator<device_t, Parameters, Data>::accumulate_equal_time_quantit
		template <dca::linalg::DeviceType device_t, class Parameters, class Data>
		void CtauxAccumulator<device_t, Parameters, Data>::accumulate_two_particle_quantities() {
		profiler_type profiler("tp-accumulation", "CT-AUX accumulator", __LINE__, thread_id);
		/*GFLOP +=*/two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign);
		GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign);
		}

		template <dca::linalg::DeviceType device_t, class Parameters, class Data>
		void CtauxAccumulator<device_t, Parameters, Data>::sumTo(this_type& other) {
		other.get_Gflop() += get_Gflop();
		other.GFLOP += GFLOP;

		other.accumulated_sign += accumulated_sign;
		other.number_of_measurements += number_of_measurements;

include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp

+5 −2

Original line number	Diff line number	Diff line
		@@ -420,6 +420,7 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
		concurrency_.sum(f);
		};

		const double local_time = total_time_;
		{
		Profiler profiler("Scalars", "QMC-collectives", __LINE__);
		concurrency_.sum(total_time_);
		@@ -430,8 +431,10 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {

		if (concurrency_.id() == concurrency_.first())
		std::cout << "\n\t\t Collect measurements \t" << dca::util::print_time() << "\n"
		<< "\n\t\t\t QMC-time : " << total_time_ << " [sec]"
		<< "\n\t\t\t Gflops : " << accumulator_.get_Gflop() / total_time_ << " [Gf]"
		<< "\n\t\t\t QMC-local-time : " << local_time << " [sec]"
		<< "\n\t\t\t QMC-total-time : " << total_time_ << " [sec]"
		<< "\n\t\t\t Gflop : " << accumulator_.get_Gflop() << " [Gf]"
		<< "\n\t\t\t Gflop/s : " << accumulator_.get_Gflop() / local_time << " [Gf/s]"
		<< "\n\t\t\t sign : " << accumulated_sign_ / parameters_.get_measurements()
		<< " \n";

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -33,7 +33,7 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>*
		int nb, int nk, int nw_pos, Real beta, cudaStream_t stream);

		template <typename Real, FourPointType type>
		void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu,
		float updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu,
		const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk,
		const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign,
		bool atomic, cudaStream_t stream);

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/ndft/cached_ndft_cpu.hpp

+11 −11

Original line number	Diff line number	Diff line
		@@ -52,7 +52,7 @@ public:
		// Returns: the number of gigaflops performed by the method.
		// TODO: remove the gigaflops if they are not necessary.
		template <class Configuration, typename ScalarInp, class OutDmn>
		double execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M,
		float execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M,
		func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, int spin = 0);

		private:
		@@ -64,7 +64,7 @@ private:

		void computeTSubmatrices(int orb_i, int orb_j);

		double executeTrimmedFT();
		float executeTrimmedFT();

		void inline copyPartialResult(
		int orb1, int orb2, int /*spin*/,
		@@ -110,12 +110,12 @@ private:

		template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
		template <class Configuration, typename ScalarInp, class OutDmn>
		double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute(
		float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute(
		const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M,
		func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, const int spin) {
		assert(M_r_r_w_w[M_r_r_w_w.signature() - 1] == WDmn::dmn_size());
		assert(M_r_r_w_w[M_r_r_w_w.signature() - 2] == WPosDmn::dmn_size());
		double gflop = 0.;
		double flops = 0.;

		BaseClass::sortConfiguration(configuration);

		@@ -132,7 +132,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens

		computeTSubmatrices(orb_i, orb_j);

		gflop += executeTrimmedFT();
		flops += executeTrimmedFT();

		copyPartialResult(orb_i, orb_j, spin, M_r_r_w_w);
		}
		@@ -141,7 +141,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens
		}
		}

		return gflop;
		return flops;
		}

		template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
		@@ -209,8 +209,8 @@ void CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_densit
		}

		template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
		double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() {
		double flop = 0.;
		float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() {
		float flops = 0.;

		assert(WPosDmn::dmn_size() == WDmn::dmn_size() / 2);

		@@ -227,13 +227,13 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens
		}

		dca::linalg::matrixop::multiply(T_l_, M_ij_, T_l_times_M_ij_);
		flop += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second;
		flops += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second;

		dca::linalg::matrixop::multiply('N', 'C', T_l_times_M_ij_, T_r_, T_l_times_M_ij_times_T_r_, work_);
		flop += 4. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second *
		flops += 8. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second *
		T_l_times_M_ij_times_T_r_[0].size().second;

		return 1e-9 * flop;
		return flops;
		}

		template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>