include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp  (+56 −43)

@@ -69,6 +69,8 @@ private:
   uint64_t start_;
   uint64_t end_;
+  int my_rank = 0;
+  int mpi_size = 1;
   // Eventually distribution strategy should be pushed down into linalg::Vector but
   // I think generalization should still wait.

@@ -108,7 +110,10 @@ private:
   // BaseClass::TpDomain had changed this is no longer true as the accessible size of G4 is set in the constructor.
   void resetG4();
   // Applies pipepline ring algorithm to move G matrices around all ranks
-  void ringG(float& flop);
+  void ringG(float& flop, const int meas_id);
+  void perform_one_communication_step(float& flop);
   float updateG4(const std::size_t channel_index);
   void send(const std::array<RMatrix, 2>& data, int target, std::array<MPI_Request, 2>& request);

@@ -118,15 +123,15 @@ private:
   using BaseClass::G4_;
   // send buffer for pipeline ring algorithm
-  std::array<RMatrix, 2> sendbuff_G_;
+  std::array<RMatrix, 2> odd_sendbuff_G_;
   std::array<MPI_Request, 2> recv_requests_{MPI_REQUEST_NULL, MPI_REQUEST_NULL};
   std::array<MPI_Request, 2> send_requests_{MPI_REQUEST_NULL, MPI_REQUEST_NULL};
-#ifndef DCA_WITH_CUDA_AWARE_MPI
+#ifndef DCA_HAVE_CUDA_AWARE_MPI
   std::array<std::vector<Complex>, 2> sendbuffer_;
   std::array<std::vector<Complex>, 2> recvbuffer_;
-#endif  // DCA_WITH_CUDA_AWARE_MPI
+#endif  // DCA_HAVE_CUDA_AWARE_MPI
 };

 template <class Parameters>

@@ -138,7 +143,6 @@ TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::TpAccumulator(
   typename BaseClass::TpDomain tp_dmn;
   std::size_t local_g4_size = tp_dmn.get_size();
-  int my_rank = 0, mpi_size = 1;
   MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   local_g4_size = dca::util::ceilDiv(local_g4_size, std::size_t(mpi_size));

@@ -180,7 +184,7 @@ float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::accumulate(
     flop += updateG4(channel);
   }
-  ringG(flop);
+  ringG(flop, meas_id);
   return flop;
 }

@@ -265,10 +269,6 @@ float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::updateG4(const std:
   const FourPointType channel = BaseClass::channels_[channel_index];
-  int my_rank, mpi_size;
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   switch (channel) {
     case PARTICLE_HOLE_TRANSVERSE:
       return details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>(

@@ -306,20 +306,10 @@ float TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::updateG4(const std:
 }

 template <class Parameters>
-void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::ringG(float& flop) {
-  // get ready for send and receive
-  for (int s = 0; s < 2; ++s) {
-    sendbuff_G_[s] = G_[s];
-  }
-  int my_concurrency_id, mpi_size;
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-  MPI_Comm_rank(MPI_COMM_WORLD, &my_concurrency_id);
+void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::perform_one_communication_step(float& flop) {
   // get rank index of left and right neighbor
-  int left_neighbor = (my_concurrency_id - 1 + mpi_size) % mpi_size;
-  int right_neighbor = (my_concurrency_id + 1 + mpi_size) % mpi_size;
+  int left_neighbor = (my_rank - 1 + mpi_size) % mpi_size;
+  int right_neighbor = (my_rank + 1 + mpi_size) % mpi_size;

   // Pipepline ring algorithm in the following for-loop:
   // 1) At each time step, local rank receives a new G2 from left hand neighbor,

@@ -329,8 +319,8 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::ringG(float& flop) {
   //    a) #walker == #accumulator and shared-walk-and-accumulation-thread = true;
   //    b) and, local measurements are equal, and each accumulator should have same #measurement, i.e.
   //       measurements % ranks == 0 && local_measurement % threads == 0.
   for (int icount = 0; icount < (mpi_size - 1); icount++) {
-    send(sendbuff_G_, right_neighbor, send_requests_);
+    send(odd_sendbuff_G_, right_neighbor, send_requests_);
     receive(G_, left_neighbor, recv_requests_);

     // wait for G2 to be available again

@@ -348,9 +338,32 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::ringG(float& flop) {
   // get ready for send again
   for (int s = 0; s < 2; ++s) {
-    sendbuff_G_[s].swap(G_[s]);
+    odd_sendbuff_G_[s].swap(G_[s]);
   }
 }

+template <class Parameters>
+void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::ringG(float& flop, const int meas_id) {
+  auto is_odd = [](int const i) -> bool { return ((i % 2) == 1); };
+  bool const perform_bidirectional_ring = is_odd(meas_id);
+
+  // if (perform_bidirectional_ring == true) {
+  // get ready for current send and receive (i.e. meas_id = 1, 3, 5...)
+  for (int s = 0; s < 2; ++s) {
+    odd_sendbuff_G_[s] = G_[s];
+  }
+  for (int icount = 0; icount < (mpi_size - 1); icount++) {
+    perform_one_communication_step(flop);
+  }
+  // }
+  // else {
+  //   // store G2 in even measurement step (i.e. meas_id = 0, 2, 4...) as a previous G2
+  //   for (int s = 0; s < 2; ++s) {
+  //     even_G_[s] = G_[s];
+  //     even_sendbuff_G_[s] = G_[s];
+  //   }
+  // }
+}
+
 template <class Parameters>

@@ -366,7 +379,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::send(const std::arra
   using dca::parallel::MPITypeMap;
   const auto g_size = data[0].size().first * data[0].size().second;
-#ifdef DCA_WITH_CUDA_AWARE_MPI
+#ifdef DCA_HAVE_CUDA_AWARE_MPI
   for (int s = 0; s < 2; ++s) {
     MPI_Isend(data[s].ptr(), g_size, MPITypeMap<Complex>::value(), target, thread_id_ + 1,
               MPI_COMM_WORLD, &request[s]);

@@ -381,7 +394,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::send(const std::arra
     MPI_Isend(sendbuffer_[s].data(), g_size, MPITypeMap<Complex>::value(), target, thread_id_ + 1,
               MPI_COMM_WORLD, &request[s]);
   }
-#endif  // DCA_WITH_CUDA_AWARE_MPI
+#endif  // DCA_HAVE_CUDA_AWARE_MPI
 }

 template <class Parameters>

@@ -390,7 +403,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::receive(
   using dca::parallel::MPITypeMap;
   const auto g_size = data[0].size().first * data[0].size().second;
-#ifdef DCA_WITH_CUDA_AWARE_MPI
+#ifdef DCA_HAVE_CUDA_AWARE_MPI
   for (int s = 0; s < 2; ++s) {
     MPI_Irecv(data[s].ptr(), g_size, MPITypeMap<Complex>::value(), source, thread_id_ + 1,
               MPI_COMM_WORLD, &request[s]);

@@ -408,7 +421,7 @@ void TpAccumulator<Parameters, linalg::GPU, DistType::MPI>::receive(
     cudaMemcpy(data[s].ptr(), recvbuffer_[s].data(), g_size * sizeof(Complex),
                cudaMemcpyHostToDevice);
   }
-#endif  // DCA_WITH_CUDA_AWARE_MPI
+#endif  // DCA_HAVE_CUDA_AWARE_MPI
 }

 }  // namespace accumulator
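For reference, below is a minimal standalone sketch of the pipelined ring exchange that ringG()/perform_one_communication_step() implement: each rank forwards the block it currently holds to its right neighbor while receiving a new block from its left neighbor, for mpi_size - 1 steps. The payload, buffer names, and tag are placeholders, not the DCA RMatrix/Complex types, and the data stays on the host rather than moving the two spin sectors of G on the GPU.

// Hedged sketch (not the DCA implementation): a pipelined ring exchange.
// Each of the mpi_size ranks starts with one block of doubles; over
// mpi_size - 1 steps every rank forwards the block it currently holds to its
// right neighbor while receiving a new block from its left neighbor, so every
// rank eventually sees every block exactly once.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int my_rank = 0, mpi_size = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

  const int left_neighbor = (my_rank - 1 + mpi_size) % mpi_size;
  const int right_neighbor = (my_rank + 1) % mpi_size;

  const int block_size = 4;  // placeholder size; DCA moves the two G matrices
  std::vector<double> received(block_size, static_cast<double>(my_rank));  // plays the role of G_
  std::vector<double> outgoing(received);  // plays the role of odd_sendbuff_G_

  for (int icount = 0; icount < mpi_size - 1; ++icount) {
    MPI_Request requests[2];
    MPI_Isend(outgoing.data(), block_size, MPI_DOUBLE, right_neighbor, 0, MPI_COMM_WORLD,
              &requests[0]);
    MPI_Irecv(received.data(), block_size, MPI_DOUBLE, left_neighbor, 0, MPI_COMM_WORLD,
              &requests[1]);
    MPI_Waitall(2, requests, MPI_STATUSES_IGNORE);

    // ... accumulate with the freshly received block here (updateG4 in DCA) ...

    // The block just received becomes the payload forwarded in the next step.
    outgoing.swap(received);
  }

  std::printf("rank %d finished %d ring steps\n", my_rank, mpi_size - 1);
  MPI_Finalize();
  return 0;
}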
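The guards renamed from DCA_WITH_CUDA_AWARE_MPI to DCA_HAVE_CUDA_AWARE_MPI select between handing device pointers directly to MPI and staging the transfer through host std::vector buffers. Below is a hedged sketch of that pattern; the double payload and the send_device_block() helper are illustrative only, not the DCA API.

// Hedged sketch of the staging pattern behind the DCA_HAVE_CUDA_AWARE_MPI guards:
// with CUDA-aware MPI the device pointer goes straight into MPI_Isend; without
// it, the device data is first copied into a host buffer and that buffer is
// sent instead.
#include <mpi.h>
#include <cuda_runtime.h>
#include <vector>

void send_device_block(const double* device_data, int count, int target, int tag,
                       std::vector<double>& host_staging, MPI_Request* request) {
#ifdef DCA_HAVE_CUDA_AWARE_MPI
  // CUDA-aware MPI accepts device pointers directly.
  MPI_Isend(device_data, count, MPI_DOUBLE, target, tag, MPI_COMM_WORLD, request);
#else
  // Fallback: stage device -> host, then send the host buffer.
  host_staging.resize(count);
  cudaMemcpy(host_staging.data(), device_data, count * sizeof(double), cudaMemcpyDeviceToHost);
  MPI_Isend(host_staging.data(), count, MPI_DOUBLE, target, tag, MPI_COMM_WORLD, request);
#endif
}
// The receive side mirrors this: MPI_Irecv into the host buffer, wait on the
// request, then cudaMemcpy(..., cudaMemcpyHostToDevice) back onto the GPU.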