Commit 024efed6 authored by gbalduzz

Gather coarsegrained G with a processor gang.

parent 9495bfe6
dca/function/domains/local_domain.hpp  +99 −0
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This file implements a class to store a subset of a domain distributed among different MPI ranks.
//
// INTERNAL: This class can only be used with dmn_variadic when it's wrapped with dmn_0.

#ifndef DCA_FUNCTION_DOMAINS_LOCAL_DOMAIN_HPP
#define DCA_FUNCTION_DOMAINS_LOCAL_DOMAIN_HPP

#include <algorithm>
#include <cassert>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include "dca/util/integer_division.hpp"

namespace dca {
namespace func {
// dca::func::

template <typename BaseDomain, int id = 0>
class LocalDomain {
public:
  using element_type = typename BaseDomain::element_type;  // For putting LocalDomain in dmn_0.
  using this_type = LocalDomain<BaseDomain, id>;

  template <class Grouping>
  static void initialize(const Grouping& group);

  static std::size_t get_physical_size() {
    return elements_.size();
  }
  static std::size_t get_offset() {
    return offset_;
  }

  static std::size_t get_size() {
    assert(initialized_);
    return padded_size_;
  }
  static std::string get_name() {
    return "Local_" + BaseDomain::get_name();
  }
  static const std::vector<element_type>& get_elements() {
    assert(initialized_);
    return elements_;
  }
  static bool is_initialized() {
    return initialized_;
  }

private:
  static bool initialized_;
  static std::vector<element_type> elements_;
  static std::size_t padded_size_;
  static std::size_t offset_;
};

template <class BaseDomain, int id>
bool LocalDomain<BaseDomain, id>::initialized_ = false;
template <class BaseDomain, int id>
std::vector<typename LocalDomain<BaseDomain, id>::element_type> LocalDomain<BaseDomain, id>::elements_;
template <class BaseDomain, int id>
std::size_t LocalDomain<BaseDomain, id>::padded_size_ = 0;
template <class BaseDomain, int id>
std::size_t LocalDomain<BaseDomain, id>::offset_ = 0;

template <class BaseDomain, int id>
template <class Grouping>
void LocalDomain<BaseDomain, id>::initialize(const Grouping& group) {
  if (initialized_) {
    throw(std::logic_error("Domain " + get_name() + " is already initialized."));
  }

  const std::size_t global_size = BaseDomain::get_size();
  padded_size_ = dca::util::ceilDiv(global_size, std::size_t(group.get_size()));

  const std::size_t start = std::min(padded_size_ * group.get_id(), global_size);
  const std::size_t end = std::min(start + padded_size_, global_size);
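  // Example: with global_size = 10 and a gang of 4 ranks, padded_size_ = 3 and the ranks cover the
  // index ranges [0, 3), [3, 6), [6, 9) and [9, 10); the last rank stores a single physical element
  // while get_size() still reports the uniform padded size of 3.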

  const auto physical_size = end - start;
  offset_ = start;
  elements_.resize(physical_size);
  std::copy_n(BaseDomain::get_elements().data() + start, physical_size, elements_.data());

  initialized_ = true;
}

}  // namespace func
}  // namespace dca

#endif  // DCA_FUNCTION_DOMAINS_LOCAL_DOMAIN_HPP
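
A brief usage sketch, not part of the commit: any already-initialized base domain can be split over a gang this way. 'KClusterDmn' and 'initializeLocalSlice' are placeholder names; the gang can be any object exposing get_id() and get_size(), such as the MPIGang introduced below.

#include <cassert>
#include "dca/function/domains/local_domain.hpp"

// Placeholder helper: each rank of 'gang' initializes and inspects its slice of KClusterDmn.
template <class KClusterDmn, class Gang>
void initializeLocalSlice(const Gang& gang) {
  using LocalDmn = dca::func::LocalDomain<KClusterDmn>;
  LocalDmn::initialize(gang);

  const std::size_t stored = LocalDmn::get_physical_size();  // elements stored on this rank.
  const std::size_t offset = LocalDmn::get_offset();         // start of the slice in the base domain.
  const std::size_t padded = LocalDmn::get_size();           // uniform padded size across the gang.
  assert(offset + stored <= KClusterDmn::get_size() && stored <= padded);
}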
dca/parallel/mpi_concurrency/mpi_collective_sum.hpp  +8 −1
@@ -41,6 +41,7 @@ class MPICollectiveSum : public virtual MPIProcessorGrouping {
public:
  MPICollectiveSum() = default;

  // Wrappers to MPI_Allreduce.
  template <typename scalar_type>
  void sum(scalar_type& value) const;
  template <typename scalar_type>
@@ -59,9 +60,12 @@ public:
  template <typename scalar_type>
  void sum(linalg::Matrix<scalar_type, linalg::CPU>& f) const;

  // Wrapper to MPI_Reduce.
  template <typename Scalar, class Domain>
  void localSum(func::function<Scalar, Domain>& f, int root_id) const;

  // Delay the execution of the sum (implemented with MPI_Allreduce) until 'resolveSums' is called,
  // or 'delayedSum' is called with an object of a different Scalar type.
  template <typename Scalar>
  void delayedSum(Scalar& obj);
  template <typename Scalar>
@@ -69,6 +73,7 @@ public:
  template <typename Scalar, class domain>
  void delayedSum(func::function<Scalar, domain>& f);

  // Execute all the reductions scheduled with 'delayedSum' or with delayed 'leaveOneOutSum' calls.
  void resolveSums();

  template <typename some_type>
@@ -87,11 +92,13 @@ public:
  // in s.
  // Does nothing, if there is only one rank.
  // In/Out: s
  // In: delay. If true, delay the sum until 'resolveSums' is called.
  template <typename T>
  void leaveOneOutSum(T& s, bool delay = 0);

  // Element-wise implementations for dca::func::function.
  // In/Out: f
  // In: delay. If true, delay the sum until 'resolveSums' is called.
  template <typename Scalar, class Domain>
  void leaveOneOutSum(func::function<Scalar, Domain>& f, bool delay = 0);

@@ -561,7 +568,7 @@ std::vector<Scalar> MPICollectiveSum::avgNormalizedMomenta(const func::function<

template <typename T>
void MPICollectiveSum::sum(const T* in, T* out, std::size_t n, int id) const {
-  // On summit large messages hangs if sizeof(floating point type) type * message_size > 2^31-1.
+  // On Summit, large messages hang if sizeof(floating point type) * message_size > 2^31-1.
  constexpr std::size_t max_size = dca::util::IsComplex<T>::value
                                       ? 2 * (std::numeric_limits<int>::max() / sizeof(T))
                                       : std::numeric_limits<int>::max() / sizeof(T);
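
For context, a short usage sketch of the delayed-reduction interface documented above; it is not part of the commit, and the physical quantities are only illustrative.

#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"

// Illustrative only: schedule several same-type reductions and execute them together.
void accumulate(dca::parallel::MPIConcurrency& concurrency, double& energy, double& magnetization,
                double& jackknife_energy) {
  concurrency.delayedSum(energy);                      // scheduled, not executed yet.
  concurrency.delayedSum(magnetization);               // same Scalar type: batched with the previous call.
  concurrency.leaveOneOutSum(jackknife_energy, true);  // delayed leave-one-out sum.
  concurrency.resolveSums();                           // executes all pending reductions.
}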
dca/parallel/mpi_concurrency/mpi_concurrency.hpp  +15 −4
@@ -25,6 +25,8 @@
#include "dca/parallel/mpi_concurrency/mpi_collective_min.hpp"
#include "dca/parallel/mpi_concurrency/mpi_collective_sum.hpp"
#include "dca/parallel/mpi_concurrency/mpi_initializer.hpp"
#include "dca/parallel/mpi_concurrency/mpi_gang.hpp"
#include "dca/parallel/mpi_concurrency/mpi_gather.hpp"
#include "dca/parallel/mpi_concurrency/mpi_packing.hpp"
#include "dca/parallel/mpi_concurrency/mpi_processor_grouping.hpp"
#include "dca/parallel/util/get_bounds.hpp"
@@ -34,12 +36,15 @@ namespace parallel {
// dca::parallel::

class MPIConcurrency final : public virtual MPIInitializer,
-                             private virtual MPIProcessorGrouping,
+                             public virtual MPIProcessorGrouping,
                             public MPIPacking,
                             public MPICollectiveMax,
                             public MPICollectiveMin,
-                             public MPICollectiveSum {
+                             public MPICollectiveSum,
+                             public MPIGather {
public:
  using Gang = MPIGang;

  MPIConcurrency(int argc, char** argv);

  inline int id() const {
@@ -68,6 +73,12 @@ public:
    return util::getBounds(id(), number_of_processors(), dmn);
  }

  // Gathering with no gang specified uses the entire concurrency.
//  template <class Scalar, class DmnIn, class DmnOut>
//  void gather(const func::function<Scalar, DmnIn>& f_in, func::function<Scalar, DmnOut>& f_out) const {
//    gather(f_in, f_out, *this);
//  }

  friend std::ostream& operator<<(std::ostream& some_ostream, const MPIConcurrency& this_concurrency);

private:
@@ -147,7 +158,7 @@ bool MPIConcurrency::broadcast_object(object_type& object, int root_id) const {
  return true;
}

-}  // parallel
-}  // dca
+}  // namespace parallel
+}  // namespace dca

#endif  // DCA_PARALLEL_MPI_CONCURRENCY_MPI_CONCURRENCY_HPP
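
A sketch of how the new Gang alias might be used, not part of the commit; 'min_gang_size' is an illustrative parameter whose exact semantics follow MPIGang's constructor.

#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"

// Illustrative only: build a gang from the global processor grouping.
void useGang(const dca::parallel::MPIConcurrency& concurrency, int min_gang_size) {
  dca::parallel::MPIConcurrency::Gang gang(concurrency, min_gang_size);
  // gang.get_id() and gang.get_size() describe this rank within its gang;
  // gang.get() exposes the underlying MPI communicator.
}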
dca/parallel/mpi_concurrency/mpi_gang.hpp  +50 −0
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This class represents a subgroup of the processes in an MPIProcessorGrouping.

#ifndef DCA_PARALLEL_MPI_CONCURRENCY_MPI_GANG_HPP
#define DCA_PARALLEL_MPI_CONCURRENCY_MPI_GANG_HPP

#include <mpi.h>

#include "dca/parallel/mpi_concurrency/mpi_processor_grouping.hpp"

namespace dca {
namespace parallel {
// dca::parallel::

class MPIGang {
public:
  MPIGang(const MPIProcessorGrouping& group, int min_size);
  ~MPIGang();

  MPIGang(const MPIGang& other) = delete;

  int get_id() const {
    return id_;
  }
  int get_size() const {
    return size_;
  }

  auto get() const {
    return communicator_;
  }

private:
  MPI_Comm communicator_;
  int id_;
  int size_;
};

}  // namespace parallel
}  // namespace dca

#endif  // DCA_PARALLEL_MPI_CONCURRENCY_MPI_GANG_HPP
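
The corresponding source file is not shown in this excerpt. As a rough illustration only (not the committed implementation), a gang communicator could be carved out of the parent grouping with MPI_Comm_split along these lines:

#include <mpi.h>

// Rough sketch, not the committed code: ranks sharing a color form one gang communicator.
inline MPI_Comm makeGangComm(MPI_Comm parent, int gang_size, int& id, int& size) {
  int rank;
  MPI_Comm_rank(parent, &rank);
  const int color = rank / gang_size;  // consecutive ranks end up in the same gang.
  MPI_Comm gang;
  MPI_Comm_split(parent, color, rank, &gang);
  MPI_Comm_rank(gang, &id);
  MPI_Comm_size(gang, &size);
  return gang;
}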
dca/parallel/mpi_concurrency/mpi_gather.hpp  +67 −0
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This class provides an interface to gather functions with MPI.

#ifndef DCA_PARALLEL_MPI_CONCURRENCY_MPI_GATHER_HPP
#define DCA_PARALLEL_MPI_CONCURRENCY_MPI_GATHER_HPP

#include <algorithm>
#include <stdexcept>
#include <vector>

#include <mpi.h>

#include "dca/function/domains.hpp"
#include "dca/function/function.hpp"
#include "dca/parallel/mpi_concurrency/mpi_gang.hpp"
#include "dca/parallel/mpi_concurrency/mpi_type_map.hpp"
#include "dca/util/integer_division.hpp"

namespace dca {
namespace parallel {
// dca::parallel::

class MPIGather {
public:
  MPIGather() = default;

  // Gather the function 'f_in' from all processes in 'gang' and copy the result into 'f_out',
  // discarding any trailing padding.
  // Precondition: gang.get_size() * f_in.size() >= f_out.size()
  template <class Scalar, class DmnIn, class DmnOut, class Gang>
  void gather(const func::function<Scalar, DmnIn>& f_in, func::function<Scalar, DmnOut>& f_out,
              const Gang& gang) const;

private:
  template <class T, class Gang>
  void gather(const T* in, T* out, int local_n, const Gang& gang) const;
};

template <class Scalar, class DmnIn, class DmnOut, class Gang>
void MPIGather::gather(const func::function<Scalar, DmnIn>& f_in,
                       func::function<Scalar, DmnOut>& f_out, const Gang& gang) const {
  std::vector<Scalar> gathered(f_in.size() * gang.get_size());
  gather(f_in.values(), gathered.data(), f_in.size(), gang);

  if (f_out.size() > gathered.size())
    throw(std::logic_error("Output function is too large."));

  // TODO: move.
  std::copy_n(gathered.data(), f_out.size(), f_out.values());
}

template <class T, class Gang>
void MPIGather::gather(const T* in, T* out, int local_n, const Gang& gang) const {
  MPI_Allgather(in, local_n, MPITypeMap<T>::value(), out, local_n, MPITypeMap<T>::value(),
                gang.get());
}

}  // namespace parallel
}  // namespace dca

#endif  // DCA_PARALLEL_MPI_CONCURRENCY_MPI_GATHER_HPP
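
Tying the pieces to the commit message, a hedged end-to-end sketch (not part of the commit): each gang fills its padded slice of a coarsegrained function, and MPIGather concatenates the slices, dropping the trailing padding. 'LocalGDmn', 'GDmn' and 'FillFn' are placeholder names.

#include "dca/function/function.hpp"
#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"

// Illustrative only: gather a gang-distributed coarsegrained function into the full function.
template <class LocalGDmn, class GDmn, class FillFn>
void gatherCoarsegrainedG(const dca::parallel::MPIConcurrency& concurrency,
                          const dca::parallel::MPIGang& gang, FillFn&& computeLocalG,
                          dca::func::function<double, GDmn>& G_full) {
  dca::func::function<double, LocalGDmn> G_local;
  computeLocalG(G_local, gang);               // each gang member fills its padded slice.
  concurrency.gather(G_local, G_full, gang);  // slices are concatenated; padding past the end
                                              // of the full domain is discarded.
}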