Commit 9d4e80e0 authored by gbalduzz's avatar gbalduzz
Browse files

Extended and fixed flop computation.

parent 6b241089
Loading
Loading
Loading
Loading
+9 −2
Original line number Diff line number Diff line
@@ -54,7 +54,8 @@ public:
  // order of M's labels from (r, b, w) to (b, r, w).
  // The transform is equivalent to M(k1, k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 * r2)) M(r1, r2)
  // In/Out: M
  void execute(RMatrix& M);
  // Returns: number of flop.
  float execute(RMatrix& M);

  void setWorkspace(const std::shared_ptr<RMatrix>& workspace) {
    workspace_ = workspace;
@@ -107,7 +108,9 @@ SpaceTransform2DGpu<RDmn, KDmn, Real>::SpaceTransform2DGpu(const int nw_pos, mag
}

template <class RDmn, class KDmn, typename Real>
void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
float SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
  float flop = 0.;

  auto& T_times_M = *(workspace_);
  auto& T_times_M_times_T = M;

@@ -125,6 +128,7 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
    const int ldb = M.leadingDimension();
    const int ldc = T_times_M.leadingDimension();
    plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex(1), Complex(0), lda, ldb, ldc);
    flop += n_trafo * 8. * nc_ * M.nrCols() * nc_;
  }

  {
@@ -138,10 +142,13 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
    const int ldc = T_times_M_times_T.leadingDimension();
    const Complex norm(1. / nc_);
    plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex(0), lda, ldb, ldc);
    flop += n_trafo * 8. * M.nrRows() * nc_ * nc_;
  }

  phaseFactorsAndRearrange(T_times_M_times_T, *workspace_);
  M.swap(*workspace_);

  return flop;
}

template <class RDmn, class KDmn, typename Real>
+3 −3
Original line number Diff line number Diff line
@@ -185,8 +185,8 @@ protected:
  using MC_accumulator_data::DCA_iteration;
  using MC_accumulator_data::number_of_measurements;

  using MC_accumulator_data::current_sign;
  using MC_accumulator_data::accumulated_sign;
  using MC_accumulator_data::current_sign;

  const bool compute_std_deviation_;

@@ -464,12 +464,12 @@ void CtauxAccumulator<device_t, Parameters, Data>::accumulate_equal_time_quantit
template <dca::linalg::DeviceType device_t, class Parameters, class Data>
void CtauxAccumulator<device_t, Parameters, Data>::accumulate_two_particle_quantities() {
  profiler_type profiler("tp-accumulation", "CT-AUX accumulator", __LINE__, thread_id);
  /*GFLOP +=*/two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign);
  GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign);
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data>
void CtauxAccumulator<device_t, Parameters, Data>::sumTo(this_type& other) {
  other.get_Gflop() += get_Gflop();
  other.GFLOP += GFLOP;

  other.accumulated_sign += accumulated_sign;
  other.number_of_measurements += number_of_measurements;
+5 −2
Original line number Diff line number Diff line
@@ -420,6 +420,7 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {
      concurrency_.sum(f);
  };

  const double local_time = total_time_;
  {
    Profiler profiler("Scalars", "QMC-collectives", __LINE__);
    concurrency_.sum(total_time_);
@@ -430,8 +431,10 @@ void CtauxClusterSolver<device_t, Parameters, Data>::collect_measurements() {

  if (concurrency_.id() == concurrency_.first())
    std::cout << "\n\t\t Collect measurements \t" << dca::util::print_time() << "\n"
              << "\n\t\t\t QMC-time : " << total_time_ << " [sec]"
              << "\n\t\t\t Gflops   : " << accumulator_.get_Gflop() / total_time_ << " [Gf]"
              << "\n\t\t\t QMC-local-time : " << local_time << " [sec]"
              << "\n\t\t\t QMC-total-time : " << total_time_ << " [sec]"
              << "\n\t\t\t Gflop   : " << accumulator_.get_Gflop() << " [Gf]"
              << "\n\t\t\t Gflop/s   : " << accumulator_.get_Gflop() / local_time << " [Gf/s]"
              << "\n\t\t\t sign     : " << accumulated_sign_ / parameters_.get_measurements()
              << " \n";

+1 −1
Original line number Diff line number Diff line
@@ -33,7 +33,7 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>*
                       int nb, int nk, int nw_pos, Real beta, cudaStream_t stream);

template <typename Real, FourPointType type>
void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu,
float updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int lggu,
              const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk,
              const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign,
              bool atomic, cudaStream_t stream);
+11 −11
Original line number Diff line number Diff line
@@ -52,7 +52,7 @@ public:
  // Returns: the number of floating point operations (flop, not gigaflop) performed by the method.
  // TODO: remove the flop count if it is not necessary.
  template <class Configuration, typename ScalarInp, class OutDmn>
  double execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M,
  float execute(const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M,
                 func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, int spin = 0);

private:
@@ -64,7 +64,7 @@ private:

  void computeTSubmatrices(int orb_i, int orb_j);

  double executeTrimmedFT();
  float executeTrimmedFT();

  void inline copyPartialResult(
      int orb1, int orb2, int /*spin*/,
@@ -110,12 +110,12 @@ private:

template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
template <class Configuration, typename ScalarInp, class OutDmn>
double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute(
float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::execute(
    const Configuration& configuration, const linalg::Matrix<ScalarInp, linalg::CPU>& M,
    func::function<std::complex<ScalarType>, OutDmn>& M_r_r_w_w, const int spin) {
  assert(M_r_r_w_w[M_r_r_w_w.signature() - 1] == WDmn::dmn_size());
  assert(M_r_r_w_w[M_r_r_w_w.signature() - 2] == WPosDmn::dmn_size());
  double gflop = 0.;
  double flops = 0.;

  BaseClass::sortConfiguration(configuration);

@@ -132,7 +132,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens

        computeTSubmatrices(orb_i, orb_j);

        gflop += executeTrimmedFT();
        flops += executeTrimmedFT();

        copyPartialResult(orb_i, orb_j, spin, M_r_r_w_w);
      }
@@ -141,7 +141,7 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens
    }
  }

  return gflop;
  return flops;
}

template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
@@ -209,8 +209,8 @@ void CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_densit
}

template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() {
  double flop = 0.;
float CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_density>::executeTrimmedFT() {
  float flops = 0.;

  assert(WPosDmn::dmn_size() == WDmn::dmn_size() / 2);

@@ -227,13 +227,13 @@ double CachedNdft<ScalarType, RDmn, WDmn, WPosDmn, linalg::CPU, non_density_dens
  }

  dca::linalg::matrixop::multiply(T_l_, M_ij_, T_l_times_M_ij_);
  flop += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second;
  flops += 4 * T_l_[0].size().first * T_l_[0].size().second * M_ij_.size().second;

  dca::linalg::matrixop::multiply('N', 'C', T_l_times_M_ij_, T_r_, T_l_times_M_ij_times_T_r_, work_);
  flop += 4. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second *
  flops += 8. * T_l_times_M_ij_[0].size().first * T_l_times_M_ij_[0].size().second *
          T_l_times_M_ij_times_T_r_[0].size().second;

  return 1e-9 * flop;
  return flops;
}

template <typename ScalarType, class RDmn, class WDmn, class WPosDmn, bool non_density_density>
Loading