Unverified Commit a3932a44 authored by Peter Doak's avatar Peter Doak Committed by GitHub
Browse files

Merge pull request #98 from CompFUSE/tweak_accum_memory2

Tweak accum memory2
parents 65b2da89 4245407c
Loading
Loading
Loading
Loading
+376 −0
Original line number Diff line number Diff line
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This file provides a matrix with a more efficient reshaping between different rectangular shapes
//  of similar total size.

#pragma once

#include <cassert>
#include <cmath>
#include <sstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>

#include "dca/linalg/util/allocators/allocators.hpp"
#include "dca/linalg/device_type.hpp"
#include "dca/linalg/util/copy.hpp"
#include "dca/linalg/util/stream_functions.hpp"

namespace dca {
namespace linalg {
// dca::linalg::

// A matrix that can be cheaply reshaped between different rectangular shapes:
// storage is a flat buffer of capacity_ elements, so changing the shape only
// reallocates when the new element count exceeds the current capacity.
// Elements are stored column-major; the leading dimension equals the number of
// rows of the current shape (see leadingDimension()).
template <typename ScalarType, DeviceType device_name,
          class Allocator = util::DefaultAllocator<ScalarType, device_name>>
class ReshapableMatrix : public Allocator {
public:
  using ThisType = ReshapableMatrix<ScalarType, device_name, Allocator>;
  using ValueType = ScalarType;

  // Default constructor creates a matrix of zero size and capacity.
  ReshapableMatrix() = default;
  // Initializes a square size x size matrix.
  // NOTE(review): not marked explicit, so an int converts implicitly to a
  // matrix — confirm this implicit conversion is intended.
  ReshapableMatrix(int size);
  // Initializes a rectangular size.first x size.second matrix.
  ReshapableMatrix(std::pair<int, int> size);

  // Constructs a matrix with size rhs.size() and a copy of the elements of rhs.
  ReshapableMatrix(const ThisType& rhs);
  // Cross-device / cross-allocator copy construction (e.g. GPU <- CPU).
  template <DeviceType rhs_device_name, class AllocatorRhs>
  ReshapableMatrix(const ReshapableMatrix<ScalarType, rhs_device_name, AllocatorRhs>& rhs);

  // Constructs a matrix with size rhs.size(). The elements of rhs are moved.
  ReshapableMatrix(ThisType&& rhs);

  // Resizes the matrix to rhs.size() and copies the elements.
  ReshapableMatrix& operator=(const ThisType& rhs);
  // Cross-device / cross-allocator copy assignment.
  template <DeviceType rhs_device_name, class AllocatorRhs>
  ReshapableMatrix& operator=(const ReshapableMatrix<ScalarType, rhs_device_name, AllocatorRhs>& rhs);

  // Moves the elements of rhs into this matrix. Implemented as a swap, so rhs
  // receives this matrix's old storage.
  ReshapableMatrix& operator=(ThisType&& rhs);

  ~ReshapableMatrix();

  // Returns true if this is equal to other, false otherwise.
  // Two matrices are equal if they have the same size and contain the same
  // elements; capacity is ignored.
  // Special case: two matrices without elements are equal regardless of shape.
  bool operator==(const ReshapableMatrix<ScalarType, device_name, Allocator>& other) const;

  // Returns true if this is not equal to other, false otherwise.
  // See description of operator== for the definition of equality.
  bool operator!=(const ReshapableMatrix<ScalarType, device_name, Allocator>& other) const;

  // Returns the (i, j)-th element of the matrix (column-major indexing).
  // Preconditions: 0 <= i < size().first, 0 <= j < size().second.
  // This method is available only if device_name == CPU.
  template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
  ScalarType& operator()(int i, int j) {
    assert(i >= 0 && i < size_.first);
    assert(j >= 0 && j < size_.second);
    return data_[i + j * leadingDimension()];
  }
  template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
  const ScalarType& operator()(int i, int j) const {
    assert(i >= 0 && i < size_.first);
    assert(j >= 0 && j < size_.second);
    return data_[i + j * leadingDimension()];
  }

  // Returns the pointer to the (0,0)-th element.
  ValueType* ptr() {
    return data_;
  }
  const ValueType* ptr() const {
    return data_;
  }

  // Returns the pointer to the (i, j)-th element, or a pointer past the end of
  // the range if i == size().first or j == size().second.
  // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second.
  ValueType* ptr(int i, int j) {
    assert(i >= 0 && i <= size_.first);
    assert(j >= 0 && j <= size_.second);
    return data_ + i + j * leadingDimension();
  }
  const ValueType* ptr(int i, int j) const {
    assert(i >= 0 && i <= size_.first);
    assert(j >= 0 && j <= size_.second);
    return data_ + i + j * leadingDimension();
  }

  // Current shape as (rows, columns).
  const std::pair<int, int> size() const {
    return size_;
  }
  // Number of elements the allocated buffer can hold (elements, not bytes).
  std::size_t capacity() const {
    return capacity_;
  }
  int nrRows() const {
    return size_.first;
  }
  int nrCols() const {
    return size_.second;
  }
  // Column stride of the storage; equals the current number of rows.
  int leadingDimension() const {
    return size_.first;
  }

  // Resizes *this to a (new_size.first x new_size.second) matrix.
  // The previous elements are not copied, therefore all the elements
  // may have any value after the call to this method.
  // Returns: true if reallocation took place.
  bool resizeNoCopy(std::pair<int, int> new_size);
  // Resizes *this to a (new_size x new_size) matrix. See previous method for details.
  bool resizeNoCopy(int new_size) {
    return resizeNoCopy(std::make_pair(new_size, new_size));
  }

  // Reserves the space for at least new_size elements without changing
  // the matrix size. The value of the matrix elements is undefined after calling this method.
  // Returns: true if reallocation took place.
  bool reserveNoCopy(std::size_t new_size);

  // Exchanges shape, capacity and storage with other in O(1).
  void swap(ReshapableMatrix<ScalarType, device_name, Allocator>& other);

  // Releases the memory allocated by *this and sets size and capacity to zero.
  void clear();

#ifdef DCA_HAVE_CUDA
  // Asynchronous assignment on the given CUDA stream.
  template <DeviceType rhs_device_name>
  void setAsync(const ReshapableMatrix<ScalarType, rhs_device_name>& rhs, cudaStream_t stream);

  // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
  template <DeviceType rhs_device_name>
  void setAsync(const ReshapableMatrix<ScalarType, rhs_device_name>& rhs, int thread_id,
                int stream_id);

  // Asynchronously zeroes all elements of the current shape on the given stream.
  void setToZero(cudaStream_t stream);
#else
  // Synchronous assignment fallback for setAsync; the stream ids are ignored.
  template <DeviceType rhs_device_name>
  void setAsync(const ReshapableMatrix<ScalarType, rhs_device_name>& rhs, int /*thread_id*/,
                int /*stream_id*/);

#endif  // DCA_HAVE_CUDA

  // Returns the allocated device memory in bytes (0 for CPU matrices).
  std::size_t deviceFingerprint() const;

private:
  // Capacity growth policy: next power of two for small sizes, next multiple
  // of a fixed block size for larger ones (see the definition below).
  static std::size_t nextCapacity(std::size_t size);
  // Total number of elements of a (rows, columns) shape, computed in size_t to
  // avoid int overflow for large matrices.
  inline static size_t nrElements(std::pair<int, int> size) {
    return static_cast<size_t>(size.first) * static_cast<size_t>(size.second);
  }

  std::pair<int, int> size_ = std::make_pair(0, 0);  // Current (rows, columns).
  std::size_t capacity_ = 0;  // Allocated elements; invariant: >= nrElements(size_).

  ValueType* data_ = nullptr;  // Owned storage, managed through the Allocator base.

  // Allow other instantiations (different device/allocator) to reach data_
  // for cross-device copies.
  template <class ScalarType2, DeviceType device_name2, class Allocator2>
  friend class dca::linalg::ReshapableMatrix;
};

// Square-matrix constructor: delegates to the rectangular constructor with a
// (size, size) shape.
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>::ReshapableMatrix(const int size)
    : ReshapableMatrix(std::pair<int, int>(size, size)) {}

// Rectangular constructor: allocates a buffer of nextCapacity(rows * cols)
// elements; the elements are left uninitialized.
// Precondition: both extents are non-negative.
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>::ReshapableMatrix(
    const std::pair<int, int> size)
    : size_(size), capacity_(nextCapacity(nrElements(size))) {
  assert(size_.first >= 0);
  assert(size_.second >= 0);
  assert(capacity_ >= nrElements(size_));

  data_ = Allocator::allocate(capacity_);
}

// Copy constructor: delegates to the copy-assignment operator.
// data_ starts as nullptr, so the deallocate performed inside operator= is
// assumed to be a harmless no-op on a freshly constructed object.
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>::ReshapableMatrix(const ThisType& rhs) {
  operator=(rhs);
}

// Cross-device / cross-allocator copy constructor: allocates on this matrix's
// device and copies the elements of rhs via the templated copy assignment.
template <typename ScalarType, DeviceType device_name, class Allocator>
template <DeviceType rhs_device_name, class AllocatorRhs>
ReshapableMatrix<ScalarType, device_name, Allocator>::ReshapableMatrix(
    const ReshapableMatrix<ScalarType, rhs_device_name, AllocatorRhs>& rhs) {
  // data_ is nullptr here; operator= handles the (no-op) release of it.
  operator=(rhs);
}

// Move constructor: default-constructs an empty matrix, then steals rhs's
// state through swap. rhs is left in the empty (zero size/capacity) state.
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>::ReshapableMatrix(
    ReshapableMatrix<ScalarType, device_name, Allocator>&& rhs)
    : ReshapableMatrix() {
  swap(rhs);
}

// Copy assignment: adopts rhs's size and capacity, reallocates and copies the
// elements.
// Fix: guard against self-assignment. The previous implementation deallocated
// data_ before copying from rhs.data_, so `m = m;` read from freed memory.
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>& ReshapableMatrix<
    ScalarType, device_name, Allocator>::operator=(const ThisType& rhs) {
  if (this != &rhs) {
    size_ = rhs.size_;
    capacity_ = rhs.capacity_;

    // Always reallocate to rhs's capacity; no attempt is made to reuse an
    // existing sufficiently large buffer (matches the original behavior).
    Allocator::deallocate(data_);
    data_ = Allocator::allocate(capacity_);
    util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_);
  }
  return *this;
}

// Cross-device / cross-allocator copy assignment: reallocates to rhs's
// capacity and copies the elements across devices.
// Self-assignment is not a practical concern here: for identical template
// arguments overload resolution prefers the non-template operator=, so rhs is
// a distinct object whenever this overload is selected.
template <typename ScalarType, DeviceType device_name, class Allocator>
template <DeviceType rhs_device_name, class AllocatorRhs>
ReshapableMatrix<ScalarType, device_name, Allocator>& ReshapableMatrix<
    ScalarType, device_name,
    Allocator>::operator=(const ReshapableMatrix<ScalarType, rhs_device_name, AllocatorRhs>& rhs) {
  size_ = rhs.size_;
  capacity_ = rhs.capacity_;

  // Old storage is released and fresh storage of rhs's capacity is acquired;
  // no attempt is made to reuse an existing sufficiently large buffer.
  Allocator::deallocate(data_);
  data_ = Allocator::allocate(capacity_);
  util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_);
  return *this;
}

// Move assignment: exchanges all state with rhs; this matrix's old buffer is
// released when rhs is destroyed. Safe under self-move (swapping an object
// with itself is a no-op).
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>& ReshapableMatrix<
    ScalarType, device_name, Allocator>::operator=(ThisType&& rhs) {
  this->swap(rhs);
  return *this;
}

// Releases the owned storage. data_ may be nullptr (default-constructed or
// cleared matrices); this assumes Allocator::deallocate accepts a null
// pointer — TODO confirm against the allocator implementations.
template <typename ScalarType, DeviceType device_name, class Allocator>
ReshapableMatrix<ScalarType, device_name, Allocator>::~ReshapableMatrix() {
  Allocator::deallocate(data_);
}

template <typename ScalarType, DeviceType device_name, class Allocator>
bool ReshapableMatrix<ScalarType, device_name, Allocator>::operator==(
    const ReshapableMatrix<ScalarType, device_name, Allocator>& other) const {
  // GPU matrices are compared by first copying both operands to host memory.
  if (device_name == GPU)
    return ReshapableMatrix<ScalarType, CPU>(*this) == ReshapableMatrix<ScalarType, CPU>(other);

  // Different shapes compare equal only when both matrices are empty.
  if (size() != other.size())
    return nrRows() * nrCols() == 0 and other.nrRows() * other.nrCols() == 0;

  // Element-wise comparison in column-major order.
  // NOTE(review): (*this)(i, j) is SFINAE-restricted to device_name == CPU,
  // and this runtime `if` does not prevent the loop from being instantiated
  // for GPU — confirm this operator is only ever instantiated for CPU
  // matrices, or guard the branch at compile time.
  for (int j = 0; j < nrCols(); ++j)
    for (int i = 0; i < nrRows(); ++i)
      if ((*this)(i, j) != other(i, j))
        return false;

  return true;
}

// Inequality: the logical negation of operator== (see its definition of
// equality).
template <typename ScalarType, DeviceType device_name, class Allocator>
bool ReshapableMatrix<ScalarType, device_name, Allocator>::operator!=(
    const ReshapableMatrix<ScalarType, device_name, Allocator>& other) const {
  return not(*this == other);
}

// Changes the shape without preserving elements: ensures storage for the new
// element count, then adopts the new shape. Returns whether a reallocation
// happened.
template <typename ScalarType, DeviceType device_name, class Allocator>
bool ReshapableMatrix<ScalarType, device_name, Allocator>::resizeNoCopy(
    const std::pair<int, int> new_size) {
  const bool reallocated = reserveNoCopy(nrElements(new_size));
  size_ = new_size;
  assert(capacity_ >= nrElements(size_));
  return reallocated;
}

// Ensures storage for at least new_size elements without changing the shape.
// Element values are undefined after a reallocation.
// Returns: true if a reallocation took place.
// Fix: data_ is nulled between deallocate and allocate so that, if allocate
// throws, the destructor does not deallocate the already-freed pointer again.
template <typename ScalarType, DeviceType device_name, class Allocator>
bool ReshapableMatrix<ScalarType, device_name, Allocator>::reserveNoCopy(std::size_t new_size) {
  if (new_size > capacity_) {
    Allocator::deallocate(data_);
    data_ = nullptr;
    capacity_ = nextCapacity(new_size);
    data_ = Allocator::allocate(capacity_);
    return true;
  }
  return false;
}

// Exchanges all state (shape, capacity, storage pointer) with other in O(1);
// no elements are copied.
template <typename ScalarType, DeviceType device_name, class Allocator>
void ReshapableMatrix<ScalarType, device_name, Allocator>::swap(
    ReshapableMatrix<ScalarType, device_name, Allocator>& other) {
  using std::swap;
  swap(size_, other.size_);
  swap(capacity_, other.capacity_);
  swap(data_, other.data_);
}

// Releases the allocated memory and resets size and capacity to zero.
// Fix: data_ must be reset to nullptr after deallocation. Leaving it dangling
// caused a double free the next time reserveNoCopy (capacity_ is 0, so any
// growth deallocates data_ again) or the destructor ran.
template <typename ScalarType, DeviceType device_name, class Allocator>
void ReshapableMatrix<ScalarType, device_name, Allocator>::clear() {
  Allocator::deallocate(data_);
  data_ = nullptr;
  size_ = std::make_pair(0, 0);
  capacity_ = 0;
}

#ifdef DCA_HAVE_CUDA

// Asynchronous assignment: resizes *this to rhs's shape (old elements are
// discarded) and enqueues an element copy on the given CUDA stream.
// NOTE(review): the copy is presumably asynchronous with respect to the host —
// the caller must synchronize `stream` before reading the destination; confirm
// against util::memoryCopyAsync.
template <typename ScalarType, DeviceType device_name, class Allocator>
template <DeviceType rhs_device_name>
void ReshapableMatrix<ScalarType, device_name, Allocator>::setAsync(
    const ReshapableMatrix<ScalarType, rhs_device_name>& rhs, const cudaStream_t stream) {
  resizeNoCopy(rhs.size_);
  util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream);
}

// Convenience overload: resolves the CUDA stream registered under
// (thread_id, stream_id) through util::getStream and forwards to the
// stream-based setAsync above.
template <typename ScalarType, DeviceType device_name, class Allocator>
template <DeviceType rhs_device_name>
void ReshapableMatrix<ScalarType, device_name, Allocator>::setAsync(
    const ReshapableMatrix<ScalarType, rhs_device_name>& rhs, const int thread_id,
    const int stream_id) {
  setAsync(rhs, util::getStream(thread_id, stream_id));
}

// Asynchronously zeroes the whole size_.first x size_.second matrix on the
// given stream (leadingDimension() == nrRows(), so the storage is contiguous).
// Fix: the byte count is computed in std::size_t. The previous code multiplied
// two ints (leadingDimension() * nrCols()) first, which can overflow for large
// matrices before the widening multiply by sizeof.
template <typename ScalarType, DeviceType device_name, class Allocator>
void ReshapableMatrix<ScalarType, device_name, Allocator>::setToZero(cudaStream_t stream) {
  const std::size_t bytes =
      static_cast<std::size_t>(leadingDimension()) * static_cast<std::size_t>(nrCols()) *
      sizeof(ScalarType);
  cudaMemsetAsync(data_, 0, bytes, stream);
}

#else  // DCA_HAVE_CUDA

// Fallback used when CUDA is not available: there is no stream to enqueue on,
// so the assignment is performed synchronously and the stream identifiers are
// accepted but ignored.
template <typename ScalarType, DeviceType device_name, class Allocator>
template <DeviceType rhs_device_name>
void ReshapableMatrix<ScalarType, device_name, Allocator>::setAsync(
    const ReshapableMatrix<ScalarType, rhs_device_name>& rhs, int /*thread_id*/, int /*stream_id*/) {
  operator=(rhs);
}

#endif  // DCA_HAVE_CUDA

// Growth policy for the flat buffer: sizes up to block_size round up to the
// next power of two; larger sizes round up to the next multiple of block_size.
// This keeps the number of reallocations logarithmic for small matrices and
// bounds the wasted space for large ones.
// Fix: removed `assert(size >= 0)` — size is unsigned, so the assertion was
// vacuously true (and triggers a tautological-compare warning).
template <typename ScalarType, DeviceType device_name, class Allocator>
std::size_t ReshapableMatrix<ScalarType, device_name, Allocator>::nextCapacity(const std::size_t size) {
  constexpr std::size_t block_size = 512;

  auto next_power_of_two = [](std::size_t x) {
    if (!x)
      return std::size_t(0);
    std::size_t result = 1;
    while (result < x) {
      result <<= 1;
    }
    return result;
  };

  return size <= block_size ? next_power_of_two(size)
                            : (size + block_size - 1) / block_size * block_size;
}

// Reports the allocated device memory in bytes; only GPU matrices hold device
// memory, CPU matrices report zero.
template <typename ScalarType, DeviceType device_name, class Allocator>
std::size_t ReshapableMatrix<ScalarType, device_name, Allocator>::deviceFingerprint() const {
  return device_name == GPU ? capacity_ * sizeof(ScalarType) : std::size_t(0);
}

}  // linalg
}  // dca
+3 −0
Original line number Diff line number Diff line
@@ -21,6 +21,9 @@ template <typename T>
class AlignedAllocator {
protected:
  T* allocate(std::size_t n) {
    if (!n)
      return nullptr;

    T* ptr;
    int err = posix_memalign((void**)&ptr, 128, n * sizeof(T));
    if (err)
+33 −16
Original line number Diff line number Diff line
@@ -19,6 +19,10 @@

#include "dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp"

#include <array>
#include <memory>

#include "dca/linalg/reshapable_matrix.hpp"
#include "dca/linalg/util/magma_batched_gemm.hpp"
#include "dca/math/function_transform/special_transforms/kernels_interface.hpp"
#include "dca/phys/domains/quantum/electron_band_domain.hpp"
@@ -33,6 +37,7 @@ class SpaceTransform2DGpu : private SpaceTransform2D<RDmn, KDmn, Real> {
private:
  using Complex = std::complex<Real>;
  using MatrixDev = linalg::Matrix<Complex, linalg::GPU>;
  using RMatrix = linalg::ReshapableMatrix<Complex, linalg::GPU>;

public:
  // Constructor
@@ -44,7 +49,11 @@ public:
  // order of M's labels from (r, b, w) to (b, r, w).
  // The transform is equivalent to M(k1, k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 * r2)) M(r1, r2)
  // In/Out: M
  void execute(MatrixDev& M);
  void execute(RMatrix& M);

  void setWorkspace(const std::shared_ptr<RMatrix>& workspace) {
    workspace_ = workspace;
  }

  // Returns: the stream associated with the magma queue.
  cudaStream_t get_stream() const {
@@ -52,7 +61,11 @@ public:
  }

  std::size_t deviceFingerprint() const {
    return T_times_M_.deviceFingerprint() + T_times_M_times_T_.deviceFingerprint();
    std::size_t res(0);

      if (workspace_.unique())
        res += workspace_->deviceFingerprint();
    return res;
  }

private:
@@ -60,7 +73,7 @@ private:
  using BDmn = func::dmn_0<phys::domains::electron_band_domain>;

  static const MatrixDev& get_T_matrix();
  void rearrangeResult(const MatrixDev& in, MatrixDev& out);
  void rearrangeResult(const RMatrix& in, RMatrix& out);

  const int n_bands_;
  const int nw_;
@@ -69,8 +82,7 @@ private:
  magma_queue_t queue_;
  cudaStream_t stream_;

  MatrixDev T_times_M_;
  MatrixDev T_times_M_times_T_;
  std::shared_ptr<RMatrix> workspace_;

  linalg::util::MagmaBatchedGemm<Complex> plan1_;
  linalg::util::MagmaBatchedGemm<Complex> plan2_;
@@ -84,11 +96,16 @@ SpaceTransform2DGpu<RDmn, KDmn, Real>::SpaceTransform2DGpu(const int nw_pos, mag
      queue_(queue),
      stream_(magma_queue_get_cuda_stream(queue_)),
      plan1_(queue_),
      plan2_(queue_) {}
      plan2_(queue_) {
  workspace_ = std::make_shared<RMatrix>();
}

template <class RDmn, class KDmn, typename Real>
void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(MatrixDev& M) {
  T_times_M_.resizeNoCopy(M.size());
void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(RMatrix& M) {
  auto& T_times_M = *(workspace_);
  auto& T_times_M_times_T = M;

  T_times_M.resizeNoCopy(M.size());
  const auto& T = get_T_matrix();

  {
@@ -96,33 +113,33 @@ void SpaceTransform2DGpu<RDmn, KDmn, Real>::execute(MatrixDev& M) {

    plan1_.synchronizeCopy();
    for (int i = 0; i < n_trafo; ++i)
      plan1_.addGemm(T.ptr(), M.ptr(i * nc_, 0), T_times_M_.ptr(i * nc_, 0));
      plan1_.addGemm(T.ptr(), M.ptr(i * nc_, 0), T_times_M.ptr(i * nc_, 0));

    const int lda = T.leadingDimension();
    const int ldb = M.leadingDimension();
    const int ldc = T_times_M_.leadingDimension();
    const int ldc = T_times_M.leadingDimension();
    plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex(1), Complex(0), lda, ldb, ldc);
  }

  T_times_M_times_T_.resizeNoCopy(M.size());
  {
    const int n_trafo = M.nrCols() / nc_;
    plan2_.synchronizeCopy();
    for (int j = 0; j < n_trafo; ++j)
      plan2_.addGemm(T_times_M_.ptr(0, j * nc_), T.ptr(), T_times_M_times_T_.ptr(0, j * nc_));
      plan2_.addGemm(T_times_M.ptr(0, j * nc_), T.ptr(), T_times_M_times_T.ptr(0, j * nc_));

    const int lda = T_times_M_.leadingDimension();
    const int lda = T_times_M.leadingDimension();
    const int ldb = T.leadingDimension();
    const int ldc = T_times_M_times_T_.leadingDimension();
    const int ldc = T_times_M_times_T.leadingDimension();
    const Complex norm(1. / nc_);
    plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex(0), lda, ldb, ldc);
  }

  rearrangeResult(T_times_M_times_T_, M);
  rearrangeResult(T_times_M_times_T, *workspace_);
  M.swap(*workspace_);
}

template <class RDmn, class KDmn, typename Real>
void SpaceTransform2DGpu<RDmn, KDmn, Real>::rearrangeResult(const MatrixDev& in, MatrixDev& out) {
void SpaceTransform2DGpu<RDmn, KDmn, Real>::rearrangeResult(const RMatrix& in, RMatrix& out) {
  out.resizeNoCopy(in.size());
  details::rearrangeResult(in.ptr(), in.leadingDimension(), out.ptr(), out.leadingDimension(),
                           n_bands_, nc_, nw_, stream_);
+63 −32

File changed.

Preview size limit exceeded, changes collapsed.

+19 −2
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@

#include "dca/config/accumulation_options.hpp"
#include "dca/linalg/lapack/magma.hpp"
#include "dca/linalg/reshapable_matrix.hpp"
#include "dca/linalg/util/cuda_event.hpp"
#include "dca/linalg/util/magma_queue.hpp"
#include "dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp"
@@ -97,6 +98,8 @@ public:

  std::size_t deviceFingerprint() const {
    std::size_t res(0);
    for (const auto& work : workspaces_)
      res += work->deviceFingerprint();
    for (int s = 0; s < 2; ++s)
      res += ndft_objs_[s].deviceFingerprint() + G_[s].deviceFingerprint() +
             space_trsf_objs_[s].deviceFingerprint();
@@ -122,6 +125,8 @@ private:
  using typename BaseClass::TpGreenFunction;
  using typename BaseClass::Complex;

  using Matrix = linalg::Matrix<Complex, linalg::GPU>;

  void initializeG0();
  void resetG4();
  void initializeG4Helpers() const;
@@ -155,19 +160,21 @@ private:
  using BaseClass::n_pos_frqs_;

  using MatrixDev = linalg::Matrix<Complex, linalg::GPU>;
  using RMatrix = linalg::ReshapableMatrix<Complex, linalg::GPU>;
  using MatrixHost = linalg::Matrix<Complex, linalg::CPU>;

  std::array<linalg::util::MagmaQueue, 2> queues_;
  std::array<cudaStream_t, 2> streams_;
  linalg::util::CudaEvent event_;

  std::vector<std::shared_ptr<RMatrix>> workspaces_;

  using NdftType = CachedNdft<Real, RDmn, WTpExtDmn, WTpExtPosDmn, linalg::GPU, non_density_density_>;
  std::array<NdftType, 2> ndft_objs_;
  using DftType = math::transform::SpaceTransform2DGpu<RDmn, KDmn, Real>;
  std::array<DftType, 2> space_trsf_objs_;

  std::array<MatrixDev, 2> G_;
  std::array<MatrixHost, 2> G_matrix_host_;
  std::array<RMatrix, 2> G_;

  bool finalized_ = false;
  bool initialized_ = false;
@@ -188,6 +195,16 @@ TpAccumulator<Parameters, linalg::GPU>::TpAccumulator(
      ndft_objs_{NdftType(queues_[0]), NdftType(queues_[1])},
      space_trsf_objs_{DftType(n_pos_frqs_, queues_[0]), DftType(n_pos_frqs_, queues_[1])} {
  initializeG4Helpers();

  // Create shared workspaces.
  workspaces_.resize(n_ndft_streams_);
  for (auto& work : workspaces_)
    work = std::make_shared<RMatrix>();

  for (int i = 0; i < n_ndft_streams_; ++i) {
    ndft_objs_[i].setWorkspace(workspaces_[i]);
    space_trsf_objs_[i].setWorkspace(workspaces_[i]);
  }
}

template <class Parameters>
Loading