include/dca/function/function.hpp  +46 −5

@@ -32,6 +32,9 @@
 #include "dca/util/pack_operations.hpp"
 #include "dca/util/type_utils.hpp"
+#include "dca/parallel/util/get_workload.hpp"
+#include "mpi.h"

 namespace dca {
 namespace func {
 // dca::func::

@@ -47,7 +50,9 @@
   // Default constructor
   // Constructs the function with the name name.
   // Postcondition: All elements are set to zero.
-  function(const std::string& name = default_name_);
+  // Special case: when distributed_g4_enabled is true, G4-related functions only get an
+  // allocation of 1/p of the original G4 size, where p = number of MPI ranks.
+  function(const std::string& name = default_name_, const bool distributed_g4_enabled = false);

   // Copy constructor
   // Constructs the function with a copy of the elements and name of other.

@@ -111,6 +116,10 @@
   std::size_t size() const {
     return Nb_elements;
   }

+  void resize(std::size_t Nb_elements_new) {
+    Nb_elements = Nb_elements_new;
+  }
+
   // Returns the size of the leaf domain with the given index.
   // Does not return function values!
   int operator[](const int index) const {

@@ -156,12 +165,18 @@
   void linind_2_subind(int linind, int* subind) const;

+  // std::vector version
   void linind_2_subind(int linind, std::vector<int>& subind) const;

+  // modern RVO version
+  std::vector<int> linind_2_subind(int linind) const;
+
   // Computes the linear index for the given subindices of the leaf domains.
   // Precondition: subind stores the subindices of all LEAF domains.
   // TODO: Use std::array or std::vector to be able to check the size of subind.
   void subind_2_linind(const int* subind, int& linind) const;

+  // std::vector version, avoiding an output argument
+  int subind_2_linind(const std::vector<int>& subind) const;
+
   // Computes and returns the linear index for the given subindices of the branch or leaf
   // domains, depending on the size of subindices.
   // Enable only if all arguments are integral, to prevent subind_2_linind(int*, int) from
   // resolving to …
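For orientation, linind and subind are related by a mixed-radix encoding: size_sbdm holds the digit bases of the leaf domains and step_sbdm their strides. A minimal standalone sketch of the round trip; the domain sizes and variable names below are illustrative, not taken from the library:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // Hypothetical leaf-domain sizes, e.g. {orbital, spin, k-point}.
  const std::vector<int> sizes = {3, 2, 4};

  // Analogue of step_sbdm: the stride of each leaf domain in the flat array.
  std::vector<int> steps(sizes.size());
  int step = 1;
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    steps[i] = step;
    step *= sizes[i];
  }

  // subind_2_linind: dot product of the subindices with the strides.
  const std::vector<int> subind = {2, 1, 3};
  int linind = 0;
  for (std::size_t i = 0; i < subind.size(); ++i)
    linind += subind[i] * steps[i];

  // linind_2_subind: peel off one mixed-radix digit per leaf domain.
  std::vector<int> back(sizes.size());
  int rest = linind;
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    back[i] = rest % sizes[i];
    rest /= sizes[i];
  }
  assert(back == subind);  // the two conversions are exact inverses
}

This is also why the new by-value linind_2_subind overload can simply return the vector: the computation is purely local, and (N)RVO elides the copy.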
@@ -276,7 +291,7 @@
 template <typename scalartype, class domain>
 const std::string function<scalartype, domain>::default_name_ = "no-name";

 template <typename scalartype, class domain>
-function<scalartype, domain>::function(const std::string& name)
+function<scalartype, domain>::function(const std::string& name, const bool distributed_g4_enabled)
     : name_(name),
       function_type(__PRETTY_FUNCTION__),
       dmn(),

@@ -285,6 +300,13 @@
       size_sbdm(dmn.get_leaf_domain_sizes()),
       step_sbdm(dmn.get_leaf_domain_steps()),
       fnc_values(nullptr) {
+  if (name.substr(0, 2) == "G4" && distributed_g4_enabled) {
+    int my_rank, mpi_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    Nb_elements = dca::parallel::util::getWorkload(dmn.get_size(), mpi_size, my_rank);
+  }
+
   fnc_values = new scalartype[Nb_elements];
   for (int linind = 0; linind < Nb_elements; ++linind)
     setToZero(fnc_values[linind]);

@@ -421,6 +443,17 @@
 }

+template <typename scalartype, class domain>
+std::vector<int> function<scalartype, domain>::linind_2_subind(int linind) const {
+  std::vector<int> subind(Nb_sbdms);
+  for (int i = 0; i < int(size_sbdm.size()); ++i) {
+    subind[i] = linind % size_sbdm[i];
+    linind = (linind - subind[i]) / size_sbdm[i];
+  }
+  return subind;
+}
+
 template <typename scalartype, class domain>
 void function<scalartype, domain>::subind_2_linind(const int* const subind, int& linind) const {
   linind = 0;

@@ -428,6 +461,14 @@
   linind += subind[i] * step_sbdm[i];
 }

+template <typename scalartype, class domain>
+int function<scalartype, domain>::subind_2_linind(const std::vector<int>& subind) const {
+  int linind = 0;
+  for (int i = 0; i < int(step_sbdm.size()); ++i)
+    linind += subind[i] * step_sbdm[i];
+  return linind;
+}
+
 template <typename scalartype, class domain>
 scalartype& function<scalartype, domain>::operator()(const int* const subind) {
   int linind;

include/dca/linalg/reshapable_matrix.hpp  +6 −10

@@ -218,12 +218,11 @@
 template <typename ScalarType, DeviceType device_name, class Allocator>
 ReshapableMatrix<ScalarType, device_name, Allocator>& ReshapableMatrix<
     ScalarType, device_name, Allocator>::operator=(const ThisType& rhs) {
-  size_ = rhs.size_;
-  capacity_ = rhs.capacity_;
-  Allocator::deallocate(data_);
-  data_ = Allocator::allocate(capacity_);
-  util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_);
+  if (this != &rhs) {
+    resizeNoCopy(rhs.size_);
+    util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_);
+  }
   return *this;
 }

@@ -232,12 +231,9 @@
 template <DeviceType rhs_device_name, class AllocatorRhs>
 ReshapableMatrix<ScalarType, device_name, Allocator>& ReshapableMatrix<
     ScalarType, device_name, Allocator>::operator=(
     const ReshapableMatrix<ScalarType, rhs_device_name, AllocatorRhs>& rhs) {
-  size_ = rhs.size_;
-  capacity_ = rhs.capacity_;
-  Allocator::deallocate(data_);
-  data_ = Allocator::allocate(capacity_);
+  resizeNoCopy(rhs.size_);
   util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_);
   return *this;
 }
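The old assignment operators copied capacity_, freed data_, and reallocated by hand, duplicating the allocation logic and destroying the matrix's contents on self-assignment (the buffer was freed and reallocated before copying from itself). A toy sketch of the corrected pattern; this Buffer class is illustrative, not DCA code:

#include <algorithm>
#include <cstddef>

// Toy analogue of the corrected ReshapableMatrix assignment: guard against
// self-assignment, then route all allocation through resizeNoCopy() so the
// logic is not duplicated between operator= overloads.
class Buffer {
public:
  Buffer() = default;
  Buffer(const Buffer&) = delete;  // keep the sketch minimal (rule of three)
  ~Buffer() { delete[] data_; }

  Buffer& operator=(const Buffer& rhs) {
    if (this != &rhs) {          // without this check, the buffer would be freed
      resizeNoCopy(rhs.size_);   // and reallocated before copying from itself
      std::copy(rhs.data_, rhs.data_ + size_, data_);
    }
    return *this;
  }

private:
  void resizeNoCopy(std::size_t n) {
    if (n > capacity_) {  // reallocate only when capacity is exceeded
      delete[] data_;
      data_ = new double[n];
      capacity_ = n;
    }
    size_ = n;
  }

  double* data_ = nullptr;
  std::size_t size_ = 0;
  std::size_t capacity_ = 0;
};

int main() {
  Buffer a, b;
  b = a;  // ordinary assignment
  b = b;  // self-assignment is now a harmless no-op
}

The cross-device overload skips the check, presumably on the assumption that a matrix on another device never aliases this.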
include/dca/parallel/mpi_concurrency/mpi_collective_sum.hpp  +76 −0

@@ -17,6 +17,7 @@
 #define DCA_PARALLEL_MPI_CONCURRENCY_MPI_COLLECTIVE_SUM_HPP

 #include <algorithm>  // std::min
+#include <numeric>    // std::partial_sum
 #include <map>
 #include <string>
 #include <utility>  // std::move, std::swap

@@ -64,6 +65,12 @@
   template <typename Scalar, class Domain>
   void localSum(func::function<Scalar, Domain>& f, int root_id) const;

+  // Wrapper to MPI_Gatherv. Gathers into specified locations at the root from all processes
+  // in a group. Designed for collecting G4 when distributed_g4_enabled() == true, but only
+  // for testing purposes: if G4 is too large to fit on one GPU, this method must not be called.
+  template <typename Scalar, class Domain>
+  void gatherv(func::function<Scalar, Domain>& f, int root_id) const;
+
   // Delay the execution of sum (implemented with MPI_Allreduce) until 'resolveSums' is called,
   // or 'delayedSum' is called with an object of different Scalar type.
   template <typename Scalar>

@@ -166,6 +173,12 @@
   template <typename T>
   void sum(const T* in, T* out, std::size_t n, int rank_id = -1) const;

+  // Gathers results across ranks on process 'root_id'.
+  // Designed for collecting G4 when distributed_g4_enabled() == true, but only for testing
+  // purposes: if G4 is too large to fit on one GPU, this method must not be called.
+  template <typename T>
+  void gatherv_helper(const T* in, T* out, std::size_t total_size, int root_id = 0) const;
+
   template <typename T>
   void delayedSum(T* in, std::size_t n);

@@ -294,6 +307,26 @@
   f = std::move(f_sum);
 }

+template <typename scalar_type, class domain>
+void MPICollectiveSum::gatherv(func::function<scalar_type, domain>& f, int id) const {
+  if (id < 0 || id >= get_size())
+    throw(std::out_of_range("id out of range."));
+
+  func::function<scalar_type, domain> f_sum;
+
+  int my_rank, mpi_size;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+
+  if (my_rank == 0) {
+    std::cout << "\n *********performing MPI Gatherv Sum************* \n";
+  }
+
+  my_rank == 0 ? gatherv_helper(f.values(), f.values(), f.get_domain().get_size(), id)
+               : gatherv_helper(f.values(), f_sum.values(), f.get_domain().get_size(), id);
+}
+
 template <typename some_type>
 void MPICollectiveSum::sum_and_average(some_type& obj, const int nr_meas_rank) const {
   sum(obj);
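One caveat: at the root, gatherv passes f.values() as both in and out, so the MPI_Gatherv call inside gatherv_helper (below) receives aliased send and receive buffers, which the MPI standard forbids. The conventional remedy is MPI_IN_PLACE at the root. A self-contained sketch with plain MPI types and equal shares for brevity; DCA's MPITypeMap and grouping helpers are not used here:

#include <mpi.h>
#include <algorithm>
#include <vector>

// Each rank owns `count` doubles; the root's own share already sits at
// displs[0] == 0 of the full buffer, so MPI_IN_PLACE avoids an aliased send.
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int count = 2;
  std::vector<int> counts(size, count);
  std::vector<int> displs(size);
  for (int r = 0; r < size; ++r)
    displs[r] = r * count;

  std::vector<double> full(rank == 0 ? size * count : 0);
  std::vector<double> local(count, rank);  // this rank's contribution

  if (rank == 0) {
    std::copy(local.begin(), local.end(), full.begin());  // place root's share
    MPI_Gatherv(MPI_IN_PLACE, 0, MPI_DOUBLE,  // send count/type ignored at root
                full.data(), counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);
  }
  else {
    MPI_Gatherv(local.data(), count, MPI_DOUBLE,
                nullptr, nullptr, nullptr, MPI_DOUBLE,  // recv args ignored off-root
                0, MPI_COMM_WORLD);
  }

  MPI_Finalize();
}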
@@ -577,6 +610,49 @@
   }
 }

+template <typename T>
+void MPICollectiveSum::gatherv_helper(const T* in, T* out, std::size_t total_size,
+                                      int root_id) const {
+  int mpi_size = MPIProcessorGrouping::get_size();
+  int my_rank = MPIProcessorGrouping::get_id();
+
+  uint64_t local_work = total_size / mpi_size;
+  uint64_t more_work_before_index;
+  std::vector<int> ranks_workload(mpi_size, local_work);
+
+  // displs: integer array (of length group size). Entry i specifies the displacement
+  // relative to recvbuf at which to place the incoming data from process i (significant
+  // only at root). We reserve mpi_size + 1 entries, one extra to fit the STL algorithm
+  // logic below.
+  std::vector<int> displs(mpi_size + 1, 0);
+  int* p_ranks_workload = ranks_workload.data();
+  int* p_displs = displs.data();
+
+  const bool balanced = (total_size % mpi_size) == 0;
+
+  if (balanced) {
+    // Offset displs for each rank.
+    std::partial_sum(ranks_workload.begin(), ranks_workload.end(), displs.begin() + 1);
+    // Remove the last running sum.
+    displs.pop_back();
+  }
+  else {
+    // The first (total_size % mpi_size) ranks contribute one extra element each.
+    more_work_before_index = total_size % mpi_size;
+    std::transform(ranks_workload.begin(), ranks_workload.begin() + more_work_before_index,
+                   ranks_workload.begin(), [](int ele) { return ele + 1; });
+    std::partial_sum(ranks_workload.begin(), ranks_workload.end(), displs.begin() + 1);
+    // Remove the last running sum.
+    displs.pop_back();
+  }
+
+  MPI_Gatherv(in, ranks_workload[my_rank], MPITypeMap<T>::value(), out, p_ranks_workload,
+              p_displs, MPITypeMap<T>::value(), root_id, MPIProcessorGrouping::get());
+}
+
 template <typename Scalar>
 void MPICollectiveSum::delayedSum(Scalar& obj) {
   delayedSum(&obj, 1);

include/dca/parallel/no_concurrency/serial_collective_sum.hpp  +3 −0

@@ -39,6 +39,9 @@
   template <class T>
   void localSum(const T&, int) {}

+  template <class T>
+  void gatherv(const T&, int) {}
+
   template <class T>
   void delayedSum(T&) const {}

   void resolveSums() const {}

include/dca/parallel/util/get_workload.hpp  +33 −0

@@ -42,6 +42,39 @@
   return getWorkload(local_work, n_local_workers, local_id);
 }

+/** Returns the first and last linear indices of this rank's share (the last index itself,
+ *  not last + 1), i.e. write for (index i = start; i <= end; ++i) ...
+ *  start and end are integral indices into the flattened G4, suitable for conversion to
+ *  subindices; they are not iterators.
+ */
+inline void getComputeRange(const int& my_rank, const int& mpi_size, const uint64_t& total_G4_size,
+                            uint64_t& start, uint64_t& end) {
+  uint64_t offset = 0;
+  // Check whether the flattened one-dimensional G4 array distributes evenly across ranks.
+  // If balanced, every rank computes the same number of elements; if not, ranks with
+  // rank_id < more_work_ranks compute one more element than the others.
+  const bool balanced = (total_G4_size % static_cast<uint64_t>(mpi_size)) == 0;
+  const uint64_t local_work = total_G4_size / static_cast<uint64_t>(mpi_size);
+
+  if (balanced) {
+    offset = static_cast<uint64_t>(my_rank) * local_work;
+    end = offset + local_work - 1;
+  }
+  else {
+    const int more_work_ranks = static_cast<int>(total_G4_size % static_cast<uint64_t>(mpi_size));
+    if (my_rank < more_work_ranks) {
+      offset = static_cast<uint64_t>(my_rank) * (local_work + 1);
+      end = offset + local_work;  // local_work + 1 elements, inclusive upper bound
+    }
+    else {
+      offset = more_work_ranks * (local_work + 1) +
+               (static_cast<uint64_t>(my_rank) - more_work_ranks) * local_work;
+      end = offset + local_work - 1;  // local_work elements, inclusive upper bound
+    }
+  }
+  start = offset;
+}

 }  // util
 }  // parallel
 }  // dca
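As a sanity check on this partition, here is a standalone sketch (plain C++, no MPI) that computes the same inclusive ranges in closed form and verifies that the per-rank ranges are contiguous and exactly tile [0, total). The computeRange helper and the 10-element, 4-rank numbers are illustrative:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

// Mimics getComputeRange: the inclusive [start, end] share of a flattened array,
// with the first (total % size) ranks receiving one extra element each.
void computeRange(int rank, int size, uint64_t total, uint64_t& start, uint64_t& end) {
  const uint64_t local = total / size;
  const uint64_t extra = total % size;  // ranks [0, extra) get one more element
  start = rank * local + std::min<uint64_t>(rank, extra);
  end = start + local + (static_cast<uint64_t>(rank) < extra ? 1 : 0) - 1;
}

int main() {
  const uint64_t total = 10;  // e.g. a flattened G4 of 10 elements
  const int size = 4;         // across 4 ranks -> shares of 3, 3, 2, 2
  uint64_t expected_start = 0;
  for (int rank = 0; rank < size; ++rank) {
    uint64_t start, end;
    computeRange(rank, size, total, start, end);
    assert(start == expected_start);  // ranges are contiguous...
    expected_start = end + 1;
    std::cout << "rank " << rank << ": [" << start << ", " << end << "]\n";
  }
  assert(expected_start == total);    // ...and cover [0, total) exactly
}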