Commit 9495bfe6 authored by gbalduzz's avatar gbalduzz
Browse files

Reduce G4 only on first rank.

parent 93ad2a75
Loading
Loading
Loading
Loading
+26 −4
Original line number Diff line number Diff line
@@ -59,6 +59,9 @@ public:
  template <typename scalar_type>
  void sum(linalg::Matrix<scalar_type, linalg::CPU>& f) const;

  // Reduces f with MPI_SUM onto the process with rank 'root_id' only (no all-reduce).
  // On every calling process f is replaced by the local reduction buffer; MPI_Reduce fills that
  // buffer only on 'root_id', so f is meaningful only there after the call.
  // Throws std::out_of_range if root_id fails the range check against get_size().
  template <typename Scalar, class Domain>
  void localSum(func::function<Scalar, Domain>& f, int root_id) const;

  template <typename Scalar>
  void delayedSum(Scalar& obj);
  template <typename Scalar>
@@ -152,8 +155,9 @@ public:
                                           const std::vector<int>& orders) const;

private:
  // Computes the element-wise sum of the n entries of 'in' over all processes and stores it in
  // 'out' on the process with id 'rank', or on all processes if rank == -1.
  template <typename T>
  void sum(const T* in, T* out, std::size_t n) const;
  void sum(const T* in, T* out, std::size_t n, int rank = -1) const;

  template <typename T>
  void delayedSum(T* in, std::size_t n);
@@ -280,6 +284,18 @@ void MPICollectiveSum::sum(linalg::Matrix<scalar_type, linalg::CPU>& f) const {
  f = std::move(F);
}

// Reduces f element-wise (MPI_SUM) onto the process with rank 'root_id'.
// The reduction is written into a temporary function that then replaces f on every calling
// process. MPI_Reduce only writes the receive buffer on the root, so on all other ranks f ends up
// holding the untouched temporary (presumably zero-initialized - TODO confirm against
// func::function's default constructor).
// Throws std::out_of_range if root_id is not a valid rank, i.e. outside [0, get_size()).
template <typename scalar_type, class domain>
void MPICollectiveSum::localSum(func::function<scalar_type, domain>& f, int root_id) const {
  // Valid ranks are 0, ..., get_size() - 1 (assuming get_size() is the number of processes in the
  // group), hence get_size() itself must be rejected as well.
  if (root_id < 0 || root_id >= get_size())
    throw(std::out_of_range("id out of range."));

  func::function<scalar_type, domain> f_sum;

  sum(f.values(), f_sum.values(), f.size(), root_id);

  f = std::move(f_sum);
}

template <typename some_type>
void MPICollectiveSum::sum_and_average(some_type& obj, const int nr_meas_rank) const {
  sum(obj);
@@ -544,7 +560,7 @@ std::vector<Scalar> MPICollectiveSum::avgNormalizedMomenta(const func::function<
}

template <typename T>
void MPICollectiveSum::sum(const T* in, T* out, std::size_t n) const {
void MPICollectiveSum::sum(const T* in, T* out, std::size_t n, int id) const {
  // On Summit, large messages hang if sizeof(floating point type) * message_size > 2^31-1.
  constexpr std::size_t max_size = dca::util::IsComplex<T>::value
                                       ? 2 * (std::numeric_limits<int>::max() / sizeof(T))
@@ -552,9 +568,15 @@ void MPICollectiveSum::sum(const T* in, T* out, std::size_t n) const {

  for (std::size_t start = 0; start < n; start += max_size) {
    const int msg_size = std::min(n - start, max_size);
    if (id == -1) {
      MPI_Allreduce(in + start, out + start, msg_size, MPITypeMap<T>::value(), MPI_SUM,
                    MPIProcessorGrouping::get());
    }
    else {
      MPI_Reduce(in + start, out + start, msg_size, MPITypeMap<T>::value(), MPI_SUM, id,
                 MPIProcessorGrouping::get());
    }
  }
}

template <typename Scalar>
+3 −0
Original line number Diff line number Diff line
@@ -36,6 +36,9 @@ public:
  template <class T1, class T2>
  void sum(const T1&, T2&) const {}

  template<class T>
  void localSum(const T& , int ){}

  template <class T>
  void delayedSum(T&) const {}
  void resolveSums() const {}
+16 −16
Original line number Diff line number Diff line
@@ -280,15 +280,11 @@ double CtauxClusterSolver<device_t, Parameters, Data>::finalize(dca_info_struct_
    }
  }

  if (dca_iteration_ == parameters_.get_dca_iterations() - 1 && parameters_.accumulateG4()) {
    for (auto& G4_channel : data_.get_G4())
      G4_channel /= parameters_.get_beta() * parameters_.get_beta();

    if (compute_jack_knife_) {
      for (std::size_t channel = 0; channel < data_.get_G4().size(); ++channel)
  if (compute_jack_knife_ && parameters_.accumulateG4()) {
    for (std::size_t channel = 0; channel < parameters_.numG4Channels(); ++channel)
      data_.get_G4_error()[channel] = concurrency_.jackknifeError(data_.get_G4()[channel], true);
  }
  }

  double total = 1.e-6, integral = 0;

  for (int l = 0; l < accumulator_.get_visited_expansion_order_k().size(); l++) {
@@ -421,6 +417,8 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
  };

  const double local_time = total_time_;
  const bool accumulate_g4 =
      parameters_.accumulateG4() && dca_iteration_ == parameters_.get_dca_iterations() - 1;

  {
    Profiler profiler("QMC-collectives", "CT-AUX solver", __LINE__);
@@ -447,14 +445,14 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
    }

    // sum G4
    if (parameters_.accumulateG4() && dca_iteration_ == parameters_.get_dca_iterations() - 1) {
      for (int g4_idx = 0; g4_idx < data_.get_G4().size(); ++g4_idx) {
        auto& G4 = data_.get_G4()[g4_idx];
        G4 = accumulator_.get_sign_times_G4()[g4_idx];
    if (accumulate_g4) {
      for (int channel = 0; channel < parameters_.numG4Channels(); ++channel) {
        auto& G4 = data_.get_G4()[channel];
        G4 = accumulator_.get_sign_times_G4()[channel];
        if (compute_jack_knife_)
          concurrency_.leaveOneOutSum(G4, true);
          concurrency_.leaveOneOutSum(G4);
        else
          concurrency_.delayedSum(G4);  // TODO: reduce only on rank 0.
          concurrency_.localSum(G4, concurrency_.first());
      }
    }

@@ -466,8 +464,10 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {

  M_r_w_ /= accumulated_sign_;
  M_r_w_squared_ /= accumulated_sign_;
  if(accumulate_g4) {
      for (auto &G4 : data_.get_G4())
          G4 /= accumulated_sign_ * parameters_.get_beta() * parameters_.get_beta();
  }

  if (parameters_.additional_time_measurements()) {
    accumulator_.get_G_r_t() /= accumulated_sign_;