include/dca/parallel/mpi_concurrency/mpi_collective_sum.hpp  +26 −4

@@ -59,6 +59,9 @@ public:
   template <typename scalar_type>
   void sum(linalg::Matrix<scalar_type, linalg::CPU>& f) const;

+  template <typename Scalar, class Domain>
+  void localSum(func::function<Scalar, Domain>& f, int root_id) const;
+
   template <typename Scalar>
   void delayedSum(Scalar& obj);

@@ -152,8 +155,9 @@ public:
                                          const std::vector<int>& orders) const;

 private:
+  // Compute the sum on process id 'rank', or on all processes if rank == -1.
   template <typename T>
-  void sum(const T* in, T* out, std::size_t n) const;
+  void sum(const T* in, T* out, std::size_t n, int rank = -1) const;

   template <typename T>
   void delayedSum(T* in, std::size_t n);

@@ -280,6 +284,18 @@ void MPICollectiveSum::sum(linalg::Matrix<scalar_type, linalg::CPU>& f) const {
   f = std::move(F);
 }

+template <typename scalar_type, class domain>
+void MPICollectiveSum::localSum(func::function<scalar_type, domain>& f, int id) const {
+  if (id < 0 || id >= get_size())
+    throw(std::out_of_range("id out of range."));
+
+  func::function<scalar_type, domain> f_sum;
+
+  sum(f.values(), f_sum.values(), f.size(), id);
+
+  f = std::move(f_sum);
+}
+
 template <typename some_type>
 void MPICollectiveSum::sum_and_average(some_type& obj, const int nr_meas_rank) const {
   sum(obj);

@@ -544,7 +560,7 @@ std::vector<Scalar> MPICollectiveSum::avgNormalizedMomenta(const func::function<
 }

 template <typename T>
-void MPICollectiveSum::sum(const T* in, T* out, std::size_t n) const {
+void MPICollectiveSum::sum(const T* in, T* out, std::size_t n, int id) const {
   // On Summit, large messages hang if sizeof(floating point type) * message_size > 2^31 - 1.
   constexpr std::size_t max_size = dca::util::IsComplex<T>::value ?
                                        2 * (std::numeric_limits<int>::max() / sizeof(T))

@@ -552,9 +568,15 @@ void MPICollectiveSum::sum(const T* in, T* out, std::size_t n) const {
   for (std::size_t start = 0; start < n; start += max_size) {
     const int msg_size = std::min(n - start, max_size);
-    MPI_Allreduce(in + start, out + start, msg_size, MPITypeMap<T>::value(), MPI_SUM,
-                  MPIProcessorGrouping::get());
+    if (id == -1) {
+      MPI_Allreduce(in + start, out + start, msg_size, MPITypeMap<T>::value(), MPI_SUM,
+                    MPIProcessorGrouping::get());
+    }
+    else {
+      MPI_Reduce(in + start, out + start, msg_size, MPITypeMap<T>::value(), MPI_SUM, id,
+                 MPIProcessorGrouping::get());
+    }
   }
 }

 template <typename Scalar>
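Note: the private sum(in, out, n, rank) overload above now dispatches between MPI_Allreduce (rank == -1) and MPI_Reduce, while keeping the chunking that works around the Summit message-size limit. Below is a minimal, self-contained sketch of that pattern; it is not DCA++ code, and the name chunked_sum as well as the use of MPI_DOUBLE and MPI_COMM_WORLD are illustrative only.

#include <mpi.h>

#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// rank == -1: every process receives the sum (MPI_Allreduce); otherwise only
// 'rank' does (MPI_Reduce). Messages are split so each count fits a signed int.
void chunked_sum(const double* in, double* out, std::size_t n, int rank, MPI_Comm comm) {
  constexpr std::size_t max_size = std::numeric_limits<int>::max();
  for (std::size_t start = 0; start < n; start += max_size) {
    const int msg_size = static_cast<int>(std::min(n - start, max_size));
    if (rank == -1)
      MPI_Allreduce(in + start, out + start, msg_size, MPI_DOUBLE, MPI_SUM, comm);
    else
      MPI_Reduce(in + start, out + start, msg_size, MPI_DOUBLE, MPI_SUM, rank, comm);
  }
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int my_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  std::vector<double> local(8, my_rank + 1.0), summed(8, 0.0);
  chunked_sum(local.data(), summed.data(), local.size(), /*rank=*/0, MPI_COMM_WORLD);
  // Here only rank 0 holds the result; pass rank == -1 to give it to every process.

  MPI_Finalize();
  return 0;
}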
include/dca/parallel/no_concurrency/serial_collective_sum.hpp  +3 −0

@@ -36,6 +36,9 @@ public:
   template <class T1, class T2>
   void sum(const T1&, T2&) const {}

+  template <class T>
+  void localSum(const T&, int) {}
+
   template <class T>
   void delayedSum(T&) const {}

   void resolveSums() const {}
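Note: the serial backend only needs localSum so that solver code templated on the concurrency type keeps compiling in single-process builds; there the local data already equals the global sum, so the method is a no-op. A toy illustration of that duck-typed interface follows (hypothetical types, not DCA++ code).

#include <numeric>
#include <vector>

struct SerialConcurrency {
  // Single process: the local data already equals the global sum, so do nothing.
  template <class T>
  void localSum(T&, int) const {}
  int first() const { return 0; }
};

template <class Concurrency>
double collect(std::vector<double>& data, const Concurrency& concurrency) {
  // Client code calls localSum unconditionally; the MPI backend would reduce
  // onto concurrency.first(), the serial backend does nothing.
  concurrency.localSum(data, concurrency.first());
  return std::accumulate(data.begin(), data.end(), 0.0);
}

int main() {
  std::vector<double> data{1.0, 2.0, 3.0};
  return collect(data, SerialConcurrency{}) == 6.0 ? 0 : 1;
}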
include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp  +16 −16

@@ -280,15 +280,11 @@ double CtauxClusterSolver<device_t, Parameters, Data>::finalize(dca_info_struct_
     }
   }

-  if (dca_iteration_ == parameters_.get_dca_iterations() - 1 && parameters_.accumulateG4()) {
-    for (auto& G4_channel : data_.get_G4())
-      G4_channel /= parameters_.get_beta() * parameters_.get_beta();
-
-    if (compute_jack_knife_) {
-      for (std::size_t channel = 0; channel < data_.get_G4().size(); ++channel)
-        data_.get_G4_error()[channel] = concurrency_.jackknifeError(data_.get_G4()[channel], true);
-    }
+  if (compute_jack_knife_ && parameters_.accumulateG4()) {
+    for (std::size_t channel = 0; channel < parameters_.numG4Channels(); ++channel)
+      data_.get_G4_error()[channel] = concurrency_.jackknifeError(data_.get_G4()[channel], true);
   }

   double total = 1.e-6, integral = 0;
   for (int l = 0; l < accumulator_.get_visited_expansion_order_k().size(); l++) {

@@ -421,6 +417,8 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
   };

   const double local_time = total_time_;
+  const bool accumulate_g4 =
+      parameters_.accumulateG4() && dca_iteration_ == parameters_.get_dca_iterations() - 1;

   {
     Profiler profiler("QMC-collectives", "CT-AUX solver", __LINE__);

@@ -447,14 +445,14 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
   }

   // sum G4
-  if (parameters_.accumulateG4() && dca_iteration_ == parameters_.get_dca_iterations() - 1) {
-    for (int g4_idx = 0; g4_idx < data_.get_G4().size(); ++g4_idx) {
-      auto& G4 = data_.get_G4()[g4_idx];
-      G4 = accumulator_.get_sign_times_G4()[g4_idx];
+  if (accumulate_g4) {
+    for (int channel = 0; channel < parameters_.numG4Channels(); ++channel) {
+      auto& G4 = data_.get_G4()[channel];
+      G4 = accumulator_.get_sign_times_G4()[channel];
       if (compute_jack_knife_)
-        concurrency_.leaveOneOutSum(G4, true);
+        concurrency_.leaveOneOutSum(G4);
       else
-        concurrency_.delayedSum(G4);  // TODO: reduce only on rank 0.
+        concurrency_.localSum(G4, concurrency_.first());
     }
   }

@@ -466,8 +464,10 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
   M_r_w_ /= accumulated_sign_;
   M_r_w_squared_ /= accumulated_sign_;

+  if (accumulate_g4) {
+    for (auto& G4 : data_.get_G4())
+      G4 /= accumulated_sign_ * parameters_.get_beta() * parameters_.get_beta();
+  }

   if (parameters_.additional_time_measurements()) {
     accumulator_.get_G_r_t() /= accumulated_sign_;
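Note: in the solver, the G4 reduction and normalization now happen together in collect_measurements(): on the last DCA iteration each channel is copied from the accumulator, summed with leaveOneOutSum (jackknife path) or reduced onto the first rank only via localSum, and then divided by the summed sign times beta squared (this division previously lived in finalize()). A condensed sketch of that flow, with stand-in types and names rather than the solver's actual classes:

#include <cstddef>
#include <vector>

struct StubConcurrency {  // stand-in for the real concurrency object
  void leaveOneOutSum(std::vector<double>&) const {}  // jackknife path
  void localSum(std::vector<double>&, int) const {}   // reduce onto one rank
  int first() const { return 0; }
};

void collect_g4(std::vector<std::vector<double>>& g4,
                const std::vector<std::vector<double>>& sign_times_g4,
                const StubConcurrency& concurrency, const bool compute_jack_knife,
                const double accumulated_sign, const double beta) {
  for (std::size_t channel = 0; channel < g4.size(); ++channel) {
    g4[channel] = sign_times_g4[channel];
    if (compute_jack_knife)
      concurrency.leaveOneOutSum(g4[channel]);                 // sum excluding this rank
    else
      concurrency.localSum(g4[channel], concurrency.first());  // result only on the first rank
  }
  // Normalization that used to happen in finalize(): divide by the summed sign and beta^2.
  for (auto& channel : g4)
    for (auto& value : channel)
      value /= accumulated_sign * beta * beta;
}

int main() {
  std::vector<std::vector<double>> g4(1), sign_times_g4{{4.0, 8.0}};
  collect_g4(g4, sign_times_g4, StubConcurrency{}, false, 2.0, 1.0);
  return g4[0][0] == 2.0 ? 0 : 1;  // 4 / (2 * 1 * 1)
}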