Loading include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp +9 −2 Original line number Diff line number Diff line Loading @@ -54,7 +54,8 @@ public: // order of M's labels from (r, b, w) to (b, r, w). // The transform is equivalent to M(k1, k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 * r2)) M(r1, r2) // In/Out: M void execute(RMatrix& M); // Returns: number of flop. float execute(RMatrix& M); void setWorkspace(const std::shared_ptr<RMatrix>& workspace) { workspace_ = workspace; Loading Loading @@ -107,7 +108,9 @@ SpaceTransform2DGpu<RDmn, KDmn, Real>::SpaceTransform2DGpu(const int nw_pos, mag } template <class RDmn, class KDmn, typename Real> void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { float SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { float flop = 0.; auto& T_times_M = *(workspace_); auto& T_times_M_times_T = M; Loading @@ -125,6 +128,7 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { const int ldb = M.leadingDimension(); const int ldc = T_times_M.leadingDimension(); plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex(1), Complex(0), lda, ldb, ldc); flop += n_trafo * 8. * nc_ * M.nrCols() * nc_; } { Loading @@ -138,10 +142,13 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { const int ldc = T_times_M_times_T.leadingDimension(); const Complex norm(1. / nc_); plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex(0), lda, ldb, ldc); flop += n_trafo * 8. * M.nrRows() * nc_ * nc_; } phaseFactorsAndRearrange(T_times_M_times_T, *workspace_); M.swap(*workspace_); return flop; } template <class RDmn, class KDmn, typename Real> Loading include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -185,8 +185,8 @@ protected: using MC_accumulator_data::DCA_iteration; using MC_accumulator_data::number_of_measurements; using MC_accumulator_data::current_sign; using MC_accumulator_data::accumulated_sign; using MC_accumulator_data::current_sign; const bool compute_std_deviation_; Loading Loading @@ -464,12 +464,12 @@ void CtauxAccumulator<device_t, Parameters, Data>::accumulate_equal_time_quantit template <dca::linalg::DeviceType device_t, class Parameters, class Data> void CtauxAccumulator<device_t, Parameters, Data>::accumulate_two_particle_quantities() { profiler_type profiler("tp-accumulation", "CT-AUX accumulator", __LINE__, thread_id); /*GFLOP +=*/two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign); GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign); } template <dca::linalg::DeviceType device_t, class Parameters, class Data> void CtauxAccumulator<device_t, Parameters, Data>::sumTo(this_type& other) { other.get_Gflop() += get_Gflop(); other.GFLOP += GFLOP; other.accumulated_sign += accumulated_sign; other.number_of_measurements += number_of_measurements; Loading include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp +5 −2 Original line number Diff line number Diff line Loading @@ -420,6 +420,7 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() { concurrency_.sum(f); }; const double local_time = total_time_; { Profiler profiler("Scalars", "QMC-collectives", __LINE__); concurrency_.sum(total_time_); Loading @@ -430,8 +431,10 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() { if (concurrency_.id() == concurrency_.first()) std::cout << "\n\t\t Collect measurements \t" << dca::util::print_time() << "\n" << "\n\t\t\t QMC-time : " << total_time_ << " [sec]" << "\n\t\t\t Gflops : " << accumulator_.get_Gflop() / total_time_ << " [Gf]" << "\n\t\t\t QMC-local-time : " << local_time << " [sec]" << "\n\t\t\t QMC-total-time : " << total_time_ << " [sec]" << "\n\t\t\t Gflop : " << accumulator_.get_Gflop() << " [Gf]" << "\n\t\t\t Gflop/s : " << accumulator_.get_Gflop() / local_time << " [Gf/s]" << "\n\t\t\t sign : " << accumulated_sign_ / parameters_.get_measurements() << " \n"; Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -33,7 +33,7 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>* int nb, int nk, int nw_pos, Real beta, cudaStream_t stream); template <typename Real, FourPointType type> void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu, float updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu, const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/ndft/cached_ndft_cpu.hpp +11 −11 Original line number Diff line number Diff line Loading @@ -52,7 +52,7 @@ public: // Returns: the number of gigaflops performed by the method. // TODO: remove the gigaflops if they are not necessary. template <class Configuration, typename ScalarInp, class OutDmn> double execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M, float execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M, func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, int spin = 0); private: Loading @@ -64,7 +64,7 @@ private: void computeTSubmatrices(int orb_i, int orb_j); double executeTrimmedFT(); float executeTrimmedFT(); void inline copyPartialResult( int orb1, int orb2, int /*spin*/, Loading Loading @@ -110,12 +110,12 @@ private: template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> template <class Configuration, typename ScalarInp, class OutDmn> double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute( float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute( const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M, func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, const int spin) { assert(M_r_r_w_w[M_r_r_w_w.signature() - 1] == WDmn::dmn_size()); assert(M_r_r_w_w[M_r_r_w_w.signature() - 2] == WPosDmn::dmn_size()); double gflop = 0.; double flops = 0.; BaseClass::sortConfiguration(configuration); Loading @@ -132,7 +132,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens computeTSubmatrices(orb_i, orb_j); gflop += executeTrimmedFT(); flops += executeTrimmedFT(); copyPartialResult(orb_i, orb_j, spin, M_r_r_w_w); } Loading @@ -141,7 +141,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens } } return gflop; return flops; } template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> Loading Loading @@ -209,8 +209,8 @@ void CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_densit } template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() { double flop = 0.; float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() { float flops = 0.; assert(WPosDmn::dmn_size() == WDmn::dmn_size() / 2); Loading @@ -227,13 +227,13 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens } dca::linalg::matrixop::multiply(T_l_, M_ij_, T_l_times_M_ij_); flop += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second; flops += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second; dca::linalg::matrixop::multiply('N', 'C', T_l_times_M_ij_, T_r_, T_l_times_M_ij_times_T_r_, work_); flop += 4. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second * flops += 8. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second * T_l_times_M_ij_times_T_r_[0].size().second; return 1e-9 * flop; return flops; } template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> Loading Loading
include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp +9 −2 Original line number Diff line number Diff line Loading @@ -54,7 +54,8 @@ public: // order of M's labels from (r, b, w) to (b, r, w). // The transform is equivalent to M(k1, k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 * r2)) M(r1, r2) // In/Out: M void execute(RMatrix& M); // Returns: number of flop. float execute(RMatrix& M); void setWorkspace(const std::shared_ptr<RMatrix>& workspace) { workspace_ = workspace; Loading Loading @@ -107,7 +108,9 @@ SpaceTransform2DGpu<RDmn, KDmn, Real>::SpaceTransform2DGpu(const int nw_pos, mag } template <class RDmn, class KDmn, typename Real> void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { float SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { float flop = 0.; auto& T_times_M = *(workspace_); auto& T_times_M_times_T = M; Loading @@ -125,6 +128,7 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { const int ldb = M.leadingDimension(); const int ldc = T_times_M.leadingDimension(); plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex(1), Complex(0), lda, ldb, ldc); flop += n_trafo * 8. * nc_ * M.nrCols() * nc_; } { Loading @@ -138,10 +142,13 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) { const int ldc = T_times_M_times_T.leadingDimension(); const Complex norm(1. / nc_); plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex(0), lda, ldb, ldc); flop += n_trafo * 8. * M.nrRows() * nc_ * nc_; } phaseFactorsAndRearrange(T_times_M_times_T, *workspace_); M.swap(*workspace_); return flop; } template <class RDmn, class KDmn, typename Real> Loading
include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -185,8 +185,8 @@ protected: using MC_accumulator_data::DCA_iteration; using MC_accumulator_data::number_of_measurements; using MC_accumulator_data::current_sign; using MC_accumulator_data::accumulated_sign; using MC_accumulator_data::current_sign; const bool compute_std_deviation_; Loading Loading @@ -464,12 +464,12 @@ void CtauxAccumulator<device_t, Parameters, Data>::accumulate_equal_time_quantit template <dca::linalg::DeviceType device_t, class Parameters, class Data> void CtauxAccumulator<device_t, Parameters, Data>::accumulate_two_particle_quantities() { profiler_type profiler("tp-accumulation", "CT-AUX accumulator", __LINE__, thread_id); /*GFLOP +=*/two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign); GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign); } template <dca::linalg::DeviceType device_t, class Parameters, class Data> void CtauxAccumulator<device_t, Parameters, Data>::sumTo(this_type& other) { other.get_Gflop() += get_Gflop(); other.GFLOP += GFLOP; other.accumulated_sign += accumulated_sign; other.number_of_measurements += number_of_measurements; Loading
include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp +5 −2 Original line number Diff line number Diff line Loading @@ -420,6 +420,7 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() { concurrency_.sum(f); }; const double local_time = total_time_; { Profiler profiler("Scalars", "QMC-collectives", __LINE__); concurrency_.sum(total_time_); Loading @@ -430,8 +431,10 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() { if (concurrency_.id() == concurrency_.first()) std::cout << "\n\t\t Collect measurements \t" << dca::util::print_time() << "\n" << "\n\t\t\t QMC-time : " << total_time_ << " [sec]" << "\n\t\t\t Gflops : " << accumulator_.get_Gflop() / total_time_ << " [Gf]" << "\n\t\t\t QMC-local-time : " << local_time << " [sec]" << "\n\t\t\t QMC-total-time : " << total_time_ << " [sec]" << "\n\t\t\t Gflop : " << accumulator_.get_Gflop() << " [Gf]" << "\n\t\t\t Gflop/s : " << accumulator_.get_Gflop() / local_time << " [Gf/s]" << "\n\t\t\t sign : " << accumulated_sign_ / parameters_.get_measurements() << " \n"; Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -33,7 +33,7 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>* int nb, int nk, int nw_pos, Real beta, cudaStream_t stream); template <typename Real, FourPointType type> void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu, float updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu, const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/ndft/cached_ndft_cpu.hpp +11 −11 Original line number Diff line number Diff line Loading @@ -52,7 +52,7 @@ public: // Returns: the number of gigaflops performed by the method. // TODO: remove the gigaflops if they are not necessary. template <class Configuration, typename ScalarInp, class OutDmn> double execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M, float execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M, func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, int spin = 0); private: Loading @@ -64,7 +64,7 @@ private: void computeTSubmatrices(int orb_i, int orb_j); double executeTrimmedFT(); float executeTrimmedFT(); void inline copyPartialResult( int orb1, int orb2, int /*spin*/, Loading Loading @@ -110,12 +110,12 @@ private: template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> template <class Configuration, typename ScalarInp, class OutDmn> double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute( float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute( const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M, func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, const int spin) { assert(M_r_r_w_w[M_r_r_w_w.signature() - 1] == WDmn::dmn_size()); assert(M_r_r_w_w[M_r_r_w_w.signature() - 2] == WPosDmn::dmn_size()); double gflop = 0.; double flops = 0.; BaseClass::sortConfiguration(configuration); Loading @@ -132,7 +132,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens computeTSubmatrices(orb_i, orb_j); gflop += executeTrimmedFT(); flops += executeTrimmedFT(); copyPartialResult(orb_i, orb_j, spin, M_r_r_w_w); } Loading @@ -141,7 +141,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens } } return gflop; return flops; } template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> Loading Loading @@ -209,8 +209,8 @@ void CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_densit } template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() { double flop = 0.; float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() { float flops = 0.; assert(WPosDmn::dmn_size() == WDmn::dmn_size() / 2); Loading @@ -227,13 +227,13 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens } dca::linalg::matrixop::multiply(T_l_, M_ij_, T_l_times_M_ij_); flop += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second; flops += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second; dca::linalg::matrixop::multiply('N', 'C', T_l_times_M_ij_, T_r_, T_l_times_M_ij_times_T_r_, work_); flop += 4. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second * flops += 8. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second * T_l_times_M_ij_times_T_r_[0].size().second; return 1e-9 * flop; return flops; } template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density> Loading