Commit 024efed6 authored by gbalduzz

Gather coarsegrained G with a processor gang.

parent 9495bfe6
dca/function/domains/local_domain.hpp  +99 −0
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This file implements a class to store a subset of a domain distributed among different MPI ranks.
//
// INTERNAL: This class can only be used with dmn_variadic when it's wrapped with dmn_0.

#ifndef DCA_FUNCTION_DOMAINS_LOCAL_DOMAIN_HPP
#define DCA_FUNCTION_DOMAINS_LOCAL_DOMAIN_HPP

#include <algorithm>
#include <cassert>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include "dca/util/integer_division.hpp"

namespace dca {
namespace func {
// dca::func::

template <typename BaseDomain, int id = 0>
class LocalDomain {
public:
  using element_type = typename BaseDomain::element_type;  // For putting LocalDomain in dmn_0.
  using this_type = LocalDomain<BaseDomain, id>;

  template <class Grouping>
  static void initialize(const Grouping& group);

  static std::size_t get_physical_size() {
    return elements_.size();
  }
  static std::size_t get_offset() {
    return offset_;
  }

  static std::size_t get_size() {
    assert(initialized_);
    return padded_size_;
  }
  static std::string get_name() {
    return "Local_" + BaseDomain::get_name();
  }
  static const std::vector<element_type>& get_elements() {
    assert(initialized_);
    return elements_;
  }
  static bool is_initialized() {
    return initialized_;
  }

private:
  static bool initialized_;
  static std::vector<element_type> elements_;
  static std::size_t padded_size_;
  static std::size_t offset_;
};

template <class BaseDomain, int id>
bool LocalDomain<BaseDomain, id>::initialized_ = false;
template <class BaseDomain, int id>
std::vector<typename LocalDomain<BaseDomain, id>::element_type> LocalDomain<BaseDomain, id>::elements_;
template <class BaseDomain, int id>
std::size_t LocalDomain<BaseDomain, id>::padded_size_ = 0;
template <class BaseDomain, int id>
std::size_t LocalDomain<BaseDomain, id>::offset_ = 0;

template <class BaseDomain, int id>
template <class Grouping>
void LocalDomain<BaseDomain, id>::initialize(const Grouping& group) {
  if (initialized_) {
    throw(std::logic_error("Domain " + get_name() + " is already initialized."));
  }

  const std::size_t global_size = BaseDomain::get_size();
  padded_size_ = dca::util::ceilDiv(global_size, std::size_t(group.get_size()));

  const std::size_t start = std::min(padded_size_ * group.get_id(), global_size);
  const std::size_t end = std::min(start + padded_size_, global_size);
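  // Example: with global_size = 10 and a gang of 4 ranks, padded_size_ = 3 and the ranks cover the
  // index ranges [0, 3), [3, 6), [6, 9) and [9, 10); the last rank stores a single physical element
  // while get_size() still reports the uniform padded size of 3.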

  const auto physical_size = end - start;
  offset_ = start;
  elements_.resize(physical_size);
  std::copy_n(BaseDomain::get_elements().data() + start, physical_size, elements_.data());

  initialized_ = true;
}

}  // namespace func
}  // namespace dca

#endif  // DCA_FUNCTION_DOMAINS_LOCAL_DOMAIN_HPP
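
A brief usage sketch, not part of the commit: any already-initialized base domain can be split over a gang this way. 'KClusterDmn' and 'initializeLocalSlice' are placeholder names; the gang can be any object exposing get_id() and get_size(), such as the MPIGang introduced below.

#include <cassert>
#include "dca/function/domains/local_domain.hpp"

// Placeholder helper: each rank of 'gang' initializes and inspects its slice of KClusterDmn.
template <class KClusterDmn, class Gang>
void initializeLocalSlice(const Gang& gang) {
  using LocalDmn = dca::func::LocalDomain<KClusterDmn>;
  LocalDmn::initialize(gang);

  const std::size_t stored = LocalDmn::get_physical_size();  // elements stored on this rank.
  const std::size_t offset = LocalDmn::get_offset();         // start of the slice in the base domain.
  const std::size_t padded = LocalDmn::get_size();           // uniform padded size across the gang.
  assert(offset + stored <= KClusterDmn::get_size() && stored <= padded);
}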
dca/parallel/mpi_concurrency/mpi_collective_sum.hpp  +8 −1
@@ -41,6 +41,7 @@ class MPICollectiveSum : public virtual MPIProcessorGrouping {
public:
  MPICollectiveSum() = default;

  // Wrappers to MPI_Allreduce.
  template <typename scalar_type>
  void sum(scalar_type& value) const;
  template <typename scalar_type>
@@ -59,9 +60,12 @@ public:
  template <typename scalar_type>
  void sum(linalg::Matrix<scalar_type, linalg::CPU>& f) const;

  // Wrapper to MPI_Reduce.
  template <typename Scalar, class Domain>
  void localSum(func::function<Scalar, Domain>& f, int root_id) const;

  // Delay the execution of the sum (implemented with MPI_Allreduce) until 'resolveSums' is called,
  // or 'delayedSum' is called with an object of a different Scalar type.
  template <typename Scalar>
  void delayedSum(Scalar& obj);
  template <typename Scalar>
@@ -69,6 +73,7 @@ public:
  template <typename Scalar, class domain>
  void delayedSum(func::function<Scalar, domain>& f);

  // Execute all the reductions scheduled with 'delayedSum' or with delayed 'leaveOneOutSum' calls.
  void resolveSums();

  template <typename some_type>
@@ -87,11 +92,13 @@ public:
  // in s.
  // Does nothing, if there is only one rank.
  // In/Out: s
  // In: delay. If true, delay the sum until 'resolveSums' is called.
  template <typename T>
  void leaveOneOutSum(T& s, bool delay = 0);

  // Element-wise implementations for dca::func::function.
  // In/Out: f
  // In: delay. If true, delay the sum until 'resolveSums' is called.
  template <typename Scalar, class Domain>
  void leaveOneOutSum(func::function<Scalar, Domain>& f, bool delay = 0);

@@ -561,7 +568,7 @@ std::vector<Scalar> MPICollectiveSum::avgNormalizedMomenta(const func::function<

template <typename T>
void MPICollectiveSum::sum(const T* in, T* out, std::size_t n, int id) const {
-  // On summit large messages hangs if sizeof(floating point type) type * message_size > 2^31-1.
+  // On Summit, large messages hang if sizeof(floating point type) * message_size > 2^31-1.
  constexpr std::size_t max_size = dca::util::IsComplex<T>::value
                                       ? 2 * (std::numeric_limits<int>::max() / sizeof(T))
                                       : std::numeric_limits<int>::max() / sizeof(T);
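
For context, a short usage sketch of the delayed-reduction interface documented above; it is not part of the commit, and the physical quantities are only illustrative.

#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"

// Illustrative only: schedule several same-type reductions and execute them together.
void accumulate(dca::parallel::MPIConcurrency& concurrency, double& energy, double& magnetization,
                double& jackknife_energy) {
  concurrency.delayedSum(energy);                      // scheduled, not executed yet.
  concurrency.delayedSum(magnetization);               // same Scalar type: batched with the previous call.
  concurrency.leaveOneOutSum(jackknife_energy, true);  // delayed leave-one-out sum.
  concurrency.resolveSums();                           // executes all pending reductions.
}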
dca/parallel/mpi_concurrency/mpi_concurrency.hpp  +15 −4
@@ -25,6 +25,8 @@
#include "dca/parallel/mpi_concurrency/mpi_collective_min.hpp"
#include "dca/parallel/mpi_concurrency/mpi_collective_sum.hpp"
#include "dca/parallel/mpi_concurrency/mpi_initializer.hpp"
#include "dca/parallel/mpi_concurrency/mpi_gang.hpp"
#include "dca/parallel/mpi_concurrency/mpi_gather.hpp"
#include "dca/parallel/mpi_concurrency/mpi_packing.hpp"
#include "dca/parallel/mpi_concurrency/mpi_processor_grouping.hpp"
#include "dca/parallel/util/get_bounds.hpp"
@@ -34,12 +36,15 @@ namespace parallel {
// dca::parallel::

class MPIConcurrency final : public virtual MPIInitializer,
-                             private virtual MPIProcessorGrouping,
+                             public virtual MPIProcessorGrouping,
                             public MPIPacking,
                             public MPICollectiveMax,
                             public MPICollectiveMin,
-                             public MPICollectiveSum {
+                             public MPICollectiveSum,
+                             public MPIGather {
public:
  using Gang = MPIGang;

  MPIConcurrency(int argc, char** argv);

  inline int id() const {
@@ -68,6 +73,12 @@ public:
    return util::getBounds(id(), number_of_processors(), dmn);
  }

  // Gathering with no gang specified uses the entire concurrency.
//  template <class Scalar, class DmnIn, class DmnOut>
//  void gather(const func::function<Scalar, DmnIn>& f_in, func::function<Scalar, DmnOut>& f_out) const {
//    gather(f_in, f_out, *this);
//  }

  friend std::ostream& operator<<(std::ostream& some_ostream, const MPIConcurrency& this_concurrency);

private:
@@ -147,7 +158,7 @@ bool MPIConcurrency::broadcast_object(object_type& object, int root_id) const {
  return true;
}

-}  // parallel
-}  // dca
+}  // namespace parallel
+}  // namespace dca

#endif  // DCA_PARALLEL_MPI_CONCURRENCY_MPI_CONCURRENCY_HPP
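
A sketch of how the new Gang alias might be used, not part of the commit; 'min_gang_size' is an illustrative parameter whose exact semantics follow MPIGang's constructor.

#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"

// Illustrative only: build a gang from the global processor grouping.
void useGang(const dca::parallel::MPIConcurrency& concurrency, int min_gang_size) {
  dca::parallel::MPIConcurrency::Gang gang(concurrency, min_gang_size);
  // gang.get_id() and gang.get_size() describe this rank within its gang;
  // gang.get() exposes the underlying MPI communicator.
}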
dca/parallel/mpi_concurrency/mpi_gang.hpp  +50 −0
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This class represents a subgroup of the processes in an MPIProcessorGrouping.

#ifndef DCA_PARALLEL_MPI_CONCURRENCY_MPI_GANG_HPP
#define DCA_PARALLEL_MPI_CONCURRENCY_MPI_GANG_HPP

#include <mpi.h>

#include "dca/parallel/mpi_concurrency/mpi_processor_grouping.hpp"

namespace dca {
namespace parallel {
// dca::parallel::

class MPIGang {
public:
  MPIGang(const MPIProcessorGrouping& group, int min_size);
  ~MPIGang();

  MPIGang(const MPIGang& other) = delete;

  int get_id() const {
    return id_;
  }
  int get_size() const {
    return size_;
  }

  auto get() const {
    return communicator_;
  }

private:
  MPI_Comm communicator_;
  int id_;
  int size_;
};

}  // namespace parallel
}  // namespace dca

#endif  // DCA_PARALLEL_MPI_CONCURRENCY_MPI_GANG_HPP
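
The corresponding source file is not shown in this excerpt. As a rough illustration only (not the committed implementation), a gang communicator could be carved out of the parent grouping with MPI_Comm_split along these lines:

#include <mpi.h>

// Rough sketch, not the committed code: ranks sharing a color form one gang communicator.
inline MPI_Comm makeGangComm(MPI_Comm parent, int gang_size, int& id, int& size) {
  int rank;
  MPI_Comm_rank(parent, &rank);
  const int color = rank / gang_size;  // consecutive ranks end up in the same gang.
  MPI_Comm gang;
  MPI_Comm_split(parent, color, rank, &gang);
  MPI_Comm_rank(gang, &id);
  MPI_Comm_size(gang, &size);
  return gang;
}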
dca/parallel/mpi_concurrency/mpi_gather.hpp  +67 −0
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This class provides an interface to gather functions with MPI.

#ifndef DCA_PARALLEL_MPI_CONCURRENCY_MPI_GATHER_HPP
#define DCA_PARALLEL_MPI_CONCURRENCY_MPI_GATHER_HPP

#include <algorithm>
#include <stdexcept>
#include <vector>

#include <mpi.h>

#include "dca/function/domains.hpp"
#include "dca/function/function.hpp"
#include "dca/parallel/mpi_concurrency/mpi_gang.hpp"
#include "dca/parallel/mpi_concurrency/mpi_type_map.hpp"
#include "dca/util/integer_division.hpp"

namespace dca {
namespace parallel {
// dca::parallel::

class MPIGather {
public:
  MPIGather() = default;

  // Gather the function 'f_in' from all processes in 'gang' and copy the result into 'f_out',
  // discarding any trailing padding.
  // Precondition: gang.get_size() * f_in.size() >= f_out.size()
  template <class Scalar, class DmnIn, class DmnOut, class Gang>
  void gather(const func::function<Scalar, DmnIn>& f_in, func::function<Scalar, DmnOut>& f_out,
              const Gang& gang) const;

private:
  template <class T, class Gang>
  void gather(const T* in, T* out, int local_n, const Gang& gang) const;
};

template <class Scalar, class DmnIn, class DmnOut, class Gang>
void MPIGather::gather(const func::function<Scalar, DmnIn>& f_in,
                       func::function<Scalar, DmnOut>& f_out, const Gang& gang) const {
  std::vector<Scalar> gathered(f_in.size() * gang.get_size());
  gather(f_in.values(), gathered.data(), f_in.size(), gang);

  if (f_out.size() > gathered.size())
    throw(std::logic_error("Output function is too large."));

  // TODO: move.
  std::copy_n(gathered.data(), f_out.size(), f_out.values());
}

template <class T, class Gang>
void MPIGather::gather(const T* in, T* out, int local_n, const Gang& gang) const {
  MPI_Allgather(in, local_n, MPITypeMap<T>::value(), out, local_n, MPITypeMap<T>::value(),
                gang.get());
}

}  // namespace parallel
}  // namespace dca

#endif  // DCA_PARALLEL_MPI_CONCURRENCY_MPI_GATHER_HPP
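
Tying the pieces to the commit message, a hedged end-to-end sketch (not part of the commit): each gang fills its padded slice of a coarsegrained function, and MPIGather concatenates the slices, dropping the trailing padding. 'LocalGDmn', 'GDmn' and 'FillFn' are placeholder names.

#include "dca/function/function.hpp"
#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"

// Illustrative only: gather a gang-distributed coarsegrained function into the full function.
template <class LocalGDmn, class GDmn, class FillFn>
void gatherCoarsegrainedG(const dca::parallel::MPIConcurrency& concurrency,
                          const dca::parallel::MPIGang& gang, FillFn&& computeLocalG,
                          dca::func::function<double, GDmn>& G_full) {
  dca::func::function<double, LocalGDmn> G_local;
  computeLocalG(G_local, gang);               // each gang member fills its padded slice.
  concurrency.gather(G_local, G_full, gang);  // slices are concatenated; padding past the end
                                              // of the full domain is discarded.
}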