Loading include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp +12 −10 Original line number Diff line number Diff line Loading @@ -54,7 +54,8 @@ namespace solver { namespace ctaux { // dca::phys::solver::ctaux:: template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST = dca::DistType::NONE, typename Real = double> template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST = dca::DistType::NONE, typename Real = double> class CtauxAccumulator : public MC_accumulator_data { public: using this_type = CtauxAccumulator<device_t, Parameters, Data, DIST, Real>; Loading Loading @@ -95,7 +96,7 @@ public: template <typename walker_type> void updateFrom(walker_type& walker); void measure(); void measure(const int meas_id = -1); // Sums all accumulated objects of this accumulator to the equivalent objects // of the 'other' accumulator. Loading Loading @@ -173,7 +174,7 @@ private: void accumulate_equal_time_quantities(const std::array<linalg::Matrix<Real, linalg::GPU>, 2>& M); void accumulate_equal_time_quantities(const std::array<linalg::Matrix<Real, linalg::CPU>, 2>& M); void accumulate_two_particle_quantities(); void accumulate_two_particle_quantities(const int meas_id = -1); protected: const Parameters& parameters_; Loading Loading @@ -212,8 +213,8 @@ protected: }; template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::CtauxAccumulator(const Parameters& parameters_ref, Data& data_ref, int id) CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::CtauxAccumulator( const Parameters& parameters_ref, Data& data_ref, int id) : MC_accumulator_data(), parameters_(parameters_ref), Loading Loading @@ -365,12 +366,12 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::updateFrom(walker } template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::measure() { void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::measure(const int meas_id) { number_of_measurements += 1; accumulated_sign += current_sign; if (perform_tp_accumulation_) accumulate_two_particle_quantities(); accumulate_two_particle_quantities(meas_id); accumulate_single_particle_quantities(); Loading Loading @@ -467,9 +468,10 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::accumulate_equal_ *************************************************************/ template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::accumulate_two_particle_quantities() { void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::accumulate_two_particle_quantities( const int meas_id) { profiler_type profiler("tp-accumulation", "CT-AUX accumulator", __LINE__, thread_id); GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign); GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign, meas_id); } template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +4 −4 Original line number Diff line number Diff line Loading @@ -73,12 +73,12 @@ public: // Returns: number of flop. template <class Configuration, typename RealIn> float accumulate(const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // CPU input. For testing purposes. template <class Configuration> float accumulate(const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // Downloads the accumulation result to the host. void finalize(); Loading Loading @@ -296,7 +296,7 @@ template <class Parameters> template <class Configuration, typename RealIn> float TpAccumulator<Parameters, linalg::GPU, DistType::NONE>::accumulate( const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { Profiler profiler("accumulate", "tp-accumulation", __LINE__, thread_id_); float flop = 0; Loading @@ -321,7 +321,7 @@ template <class Parameters> template <class Configuration> float TpAccumulator<Parameters, linalg::GPU>::accumulate( const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { std::array<linalg::Matrix<double, linalg::GPU>, 2> M_dev; for (int s = 0; s < 2; ++s) M_dev[s].setAsync(M[s], queues_[0]); Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp +10 −10 Original line number Diff line number Diff line Loading @@ -77,12 +77,12 @@ private: public: template <class Configuration, typename RealIn> float accumulate(const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // CPU input. For testing purposes. template <class Configuration> float accumulate(const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // Downloads the accumulation result to the host. void finalize(); Loading Loading @@ -123,10 +123,10 @@ private: std::array<MPI_Request, 2> recv_requests_{MPI_REQUEST_NULL, MPI_REQUEST_NULL}; std::array<MPI_Request, 2> send_requests_{MPI_REQUEST_NULL, MPI_REQUEST_NULL}; #ifndef DCA_HAVE_CUDA_AWARE_MPI #ifndef DCA_WITH_CUDA_AWARE_MPI std::array<std::vector<Complex>, 2> sendbuffer_; std::array<std::vector<Complex>, 2> recvbuffer_; #endif // DCA_HAVE_CUDA_AWARE_MPI #endif // DCA_WITH_CUDA_AWARE_MPI }; template <class Parameters> Loading Loading @@ -162,7 +162,7 @@ template <class Parameters> template <class Configuration, typename RealIn> float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::accumulate( const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { // typename BaseClass::Profiler profiler("accumulate", "tp-accumulation", __LINE__, BaseClass::thread_id_); float flop = 0; Loading @@ -189,7 +189,7 @@ template <class Parameters> template <class Configuration> float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::accumulate( const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { std::array<linalg::Matrix<double, linalg::GPU>, 2> M_dev; for (int s = 0; s < 2; ++s) M_dev[s].setAsync(M[s], queues_[0]); Loading Loading @@ -366,7 +366,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::send(const std::arra using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; #ifdef DCA_HAVE_CUDA_AWARE_MPI #ifdef DCA_WITH_CUDA_AWARE_MPI for (int s = 0; s < 2; ++s) { MPI_Isend(data[s].ptr(), g_size, MPITypeMap<Complex>::value(), target, thread_id_ + 1, MPI_COMM_WORLD, &request[s]); Loading @@ -381,7 +381,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::send(const std::arra MPI_Isend(sendbuffer_[s].data(), g_size, MPITypeMap<Complex>::value(), target, thread_id_ + 1, MPI_COMM_WORLD, &request[s]); } #endif // DCA_HAVE_CUDA_AWARE_MPI #endif // DCA_WITH_CUDA_AWARE_MPI } template <class Parameters> Loading @@ -390,7 +390,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::receive( using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; #ifdef DCA_HAVE_CUDA_AWARE_MPI #ifdef DCA_WITH_CUDA_AWARE_MPI for (int s = 0; s < 2; ++s) { MPI_Irecv(data[s].ptr(), g_size, MPITypeMap<Complex>::value(), source, thread_id_ + 1, MPI_COMM_WORLD, &request[s]); Loading @@ -408,7 +408,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::receive( cudaMemcpy(data[s].ptr(), recvbuffer_[s].data(), g_size * sizeof(Complex), cudaMemcpyHostToDevice); } #endif // DCA_HAVE_CUDA_AWARE_MPI #endif // DCA_WITH_CUDA_AWARE_MPI } } // namespace accumulator Loading include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -425,7 +425,7 @@ void StdThreadQmciClusterSolver<QmciSolver>::startWalkerAndAccumulator(int id) { { Profiler profiler("Accumulator measuring", "stdthread-MC", __LINE__, id); accumulator_obj.updateFrom(walker); accumulator_obj.measure(); accumulator_obj.measure(meas_id); } if (print) walker.updateShell(meas_id, n_meas); Loading Loading
include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp +12 −10 Original line number Diff line number Diff line Loading @@ -54,7 +54,8 @@ namespace solver { namespace ctaux { // dca::phys::solver::ctaux:: template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST = dca::DistType::NONE, typename Real = double> template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST = dca::DistType::NONE, typename Real = double> class CtauxAccumulator : public MC_accumulator_data { public: using this_type = CtauxAccumulator<device_t, Parameters, Data, DIST, Real>; Loading Loading @@ -95,7 +96,7 @@ public: template <typename walker_type> void updateFrom(walker_type& walker); void measure(); void measure(const int meas_id = -1); // Sums all accumulated objects of this accumulator to the equivalent objects // of the 'other' accumulator. Loading Loading @@ -173,7 +174,7 @@ private: void accumulate_equal_time_quantities(const std::array<linalg::Matrix<Real, linalg::GPU>, 2>& M); void accumulate_equal_time_quantities(const std::array<linalg::Matrix<Real, linalg::CPU>, 2>& M); void accumulate_two_particle_quantities(); void accumulate_two_particle_quantities(const int meas_id = -1); protected: const Parameters& parameters_; Loading Loading @@ -212,8 +213,8 @@ protected: }; template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::CtauxAccumulator(const Parameters& parameters_ref, Data& data_ref, int id) CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::CtauxAccumulator( const Parameters& parameters_ref, Data& data_ref, int id) : MC_accumulator_data(), parameters_(parameters_ref), Loading Loading @@ -365,12 +366,12 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::updateFrom(walker } template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::measure() { void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::measure(const int meas_id) { number_of_measurements += 1; accumulated_sign += current_sign; if (perform_tp_accumulation_) accumulate_two_particle_quantities(); accumulate_two_particle_quantities(meas_id); accumulate_single_particle_quantities(); Loading Loading @@ -467,9 +468,10 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::accumulate_equal_ *************************************************************/ template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::accumulate_two_particle_quantities() { void CtauxAccumulator<device_t, Parameters, Data, DIST, Real>::accumulate_two_particle_quantities( const int meas_id) { profiler_type profiler("tp-accumulation", "CT-AUX accumulator", __LINE__, thread_id); GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign); GFLOP += 1e-9 * two_particle_accumulator_.accumulate(M_, hs_configuration_, current_sign, meas_id); } template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST, typename Real> Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +4 −4 Original line number Diff line number Diff line Loading @@ -73,12 +73,12 @@ public: // Returns: number of flop. template <class Configuration, typename RealIn> float accumulate(const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // CPU input. For testing purposes. template <class Configuration> float accumulate(const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // Downloads the accumulation result to the host. void finalize(); Loading Loading @@ -296,7 +296,7 @@ template <class Parameters> template <class Configuration, typename RealIn> float TpAccumulator<Parameters, linalg::GPU, DistType::NONE>::accumulate( const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { Profiler profiler("accumulate", "tp-accumulation", __LINE__, thread_id_); float flop = 0; Loading @@ -321,7 +321,7 @@ template <class Parameters> template <class Configuration> float TpAccumulator<Parameters, linalg::GPU>::accumulate( const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { std::array<linalg::Matrix<double, linalg::GPU>, 2> M_dev; for (int s = 0; s < 2; ++s) M_dev[s].setAsync(M[s], queues_[0]); Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp +10 −10 Original line number Diff line number Diff line Loading @@ -77,12 +77,12 @@ private: public: template <class Configuration, typename RealIn> float accumulate(const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // CPU input. For testing purposes. template <class Configuration> float accumulate(const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, int sign); const std::array<Configuration, 2>& configs, int sign, const int meas_id = -1); // Downloads the accumulation result to the host. void finalize(); Loading Loading @@ -123,10 +123,10 @@ private: std::array<MPI_Request, 2> recv_requests_{MPI_REQUEST_NULL, MPI_REQUEST_NULL}; std::array<MPI_Request, 2> send_requests_{MPI_REQUEST_NULL, MPI_REQUEST_NULL}; #ifndef DCA_HAVE_CUDA_AWARE_MPI #ifndef DCA_WITH_CUDA_AWARE_MPI std::array<std::vector<Complex>, 2> sendbuffer_; std::array<std::vector<Complex>, 2> recvbuffer_; #endif // DCA_HAVE_CUDA_AWARE_MPI #endif // DCA_WITH_CUDA_AWARE_MPI }; template <class Parameters> Loading Loading @@ -162,7 +162,7 @@ template <class Parameters> template <class Configuration, typename RealIn> float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::accumulate( const std::array<linalg::Matrix<RealIn, linalg::GPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { // typename BaseClass::Profiler profiler("accumulate", "tp-accumulation", __LINE__, BaseClass::thread_id_); float flop = 0; Loading @@ -189,7 +189,7 @@ template <class Parameters> template <class Configuration> float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::accumulate( const std::array<linalg::Matrix<double, linalg::CPU>, 2>& M, const std::array<Configuration, 2>& configs, const int sign) { const std::array<Configuration, 2>& configs, const int sign, const int meas_id) { std::array<linalg::Matrix<double, linalg::GPU>, 2> M_dev; for (int s = 0; s < 2; ++s) M_dev[s].setAsync(M[s], queues_[0]); Loading Loading @@ -366,7 +366,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::send(const std::arra using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; #ifdef DCA_HAVE_CUDA_AWARE_MPI #ifdef DCA_WITH_CUDA_AWARE_MPI for (int s = 0; s < 2; ++s) { MPI_Isend(data[s].ptr(), g_size, MPITypeMap<Complex>::value(), target, thread_id_ + 1, MPI_COMM_WORLD, &request[s]); Loading @@ -381,7 +381,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::send(const std::arra MPI_Isend(sendbuffer_[s].data(), g_size, MPITypeMap<Complex>::value(), target, thread_id_ + 1, MPI_COMM_WORLD, &request[s]); } #endif // DCA_HAVE_CUDA_AWARE_MPI #endif // DCA_WITH_CUDA_AWARE_MPI } template <class Parameters> Loading @@ -390,7 +390,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::receive( using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; #ifdef DCA_HAVE_CUDA_AWARE_MPI #ifdef DCA_WITH_CUDA_AWARE_MPI for (int s = 0; s < 2; ++s) { MPI_Irecv(data[s].ptr(), g_size, MPITypeMap<Complex>::value(), source, thread_id_ + 1, MPI_COMM_WORLD, &request[s]); Loading @@ -408,7 +408,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::receive( cudaMemcpy(data[s].ptr(), recvbuffer_[s].data(), g_size * sizeof(Complex), cudaMemcpyHostToDevice); } #endif // DCA_HAVE_CUDA_AWARE_MPI #endif // DCA_WITH_CUDA_AWARE_MPI } } // namespace accumulator Loading
include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -425,7 +425,7 @@ void StdThreadQmciClusterSolver<QmciSolver>::startWalkerAndAccumulator(int id) { { Profiler profiler("Accumulator measuring", "stdthread-MC", __LINE__, id); accumulator_obj.updateFrom(walker); accumulator_obj.measure(); accumulator_obj.measure(meas_id); } if (print) walker.updateShell(meas_id, n_meas); Loading