Unverified Commit 7b6f95c4 authored by Peter Doak's avatar Peter Doak Committed by GitHub
Browse files

Merge pull request #169 from gbalduzz/generic_container

Generic container for AoS of multiple types
parents d834e02a 6c2ca2d7
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -190,7 +190,7 @@ public:
  // Enable only if all arguments are integral, to prevent subind_2_linind(int*, int) from
  // resolving to subind_2_linind(int...) rather than subind_2_linind(const int* const, int).
  template <typename... Ts>
  std::enable_if_t<util::if_all<std::is_integral<Ts>::value...>::value, int> subind_2_linind(
  std::enable_if_t<util::ifAll(std::is_integral_v<Ts>...), int> subind_2_linind(
      const Ts... subindices) const {
    // We need to cast all subindices to the same type for dmn_variadic.
    return dmn(static_cast<int>(subindices)...);
+18 −24
Original line number Diff line number Diff line
@@ -194,22 +194,20 @@ public:
  // Swaps the contents of the matrix, included the name, with those of rhs.
  void swapWithName(Matrix<ScalarType, device_name>& rhs);

#ifdef DCA_HAVE_CUDA
  // Asynchronous assignment.
  // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
  // + synchronization of stream
  template <DeviceType rhs_device_name>
  void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, cudaStream_t stream);
  void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

  // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
  // Asynchronous assignment.
  template <DeviceType rhs_device_name>
  void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);
  void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, const util::CudaStream& stream);

  void setToZero(cudaStream_t stream);
#else
  // Synchronous assignment fallback for SetAsync.
  // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
  template <DeviceType rhs_device_name>
  void setAsync(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

#endif  // DCA_HAVE_CUDA
  void setToZero(const util::CudaStream& stream);

  // Prints the values of the matrix elements.
  void print() const;
@@ -413,12 +411,19 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam
  swap(rhs);
}

#ifdef DCA_HAVE_CUDA
// Copies the contents of rhs into this matrix, resizing this matrix first.
// The (thread_id, stream_id) pair is forwarded to util::memoryCopy, which selects
// the CUDA stream used for the transfer.
// NOTE(review): resize() (not resizeNoCopy()) is called, so pre-existing elements
// may be preserved/copied during the resize before being overwritten — confirm
// resizeNoCopy() would not be sufficient here.
template <typename ScalarType, DeviceType device_name>
template <DeviceType rhs_device_name>
void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
                                          int thread_id, int stream_id) {
  resize(rhs.size_);
  util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id,
                   stream_id);
}

template <typename ScalarType, DeviceType device_name>
template <DeviceType rhs_device_name>
void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs,
                                               const cudaStream_t stream) {
                                               const util::CudaStream& stream) {
  resizeNoCopy(rhs.size_);
  util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream);
}
@@ -431,21 +436,10 @@ void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_devi
}

template <typename ScalarType, DeviceType device_name>
void Matrix<ScalarType, device_name>::setToZero(cudaStream_t stream) {
  cudaMemsetAsync(data_, 0, leadingDimension() * nrCols() * sizeof(ScalarType), stream);
}

#else

template <typename ScalarType, DeviceType device_name>
template <DeviceType rhs_device_name>
void Matrix<ScalarType, device_name>::setAsync(const Matrix<ScalarType, rhs_device_name>& rhs,
                                               int /*thread_id*/, int /*stream_id*/) {
  set(rhs);
void Matrix<ScalarType, device_name>::setToZero(const util::CudaStream& stream) {
  util::Memory<device_name>::setToZeroAsync(data_, leadingDimension() * nrCols(), stream);
}

#endif  // DCA_HAVE_CUDA

template <typename ScalarType, DeviceType device_name>
void Matrix<ScalarType, device_name>::print() const {
  if (device_name == GPU)
+104 −0
Original line number Diff line number Diff line
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// This class represents an AoS where each array has the same length but arbitrary type.

#ifndef DCA_LINALG_MULTI_VECTOR_HPP
#define DCA_LINALG_MULTI_VECTOR_HPP

#include "dca/linalg/vector.hpp"
#include "dca/linalg/util/cuda_stream.hpp"
#include "dca/util/type_list.hpp"
#include "dca/util/pack_operations.hpp"

namespace dca {
namespace linalg {
// dca::linalg::

// Array-of-structures container holding one sub-array per type in Ts..., all of
// equal length, packed back to back in a single flat byte buffer on `device`.
template <DeviceType device, typename... Ts>
class MultiVector {
public:
  // Compile-time list of the sub-array element types.
  using Types = dca::util::Typelist<Ts...>;
  // Element type of the id-th sub-array.
  template <unsigned id>
  using Type = typename dca::util::TypeAt<id, Types>::type;

  // Initialize each sub-array with size n.
  MultiVector(std::size_t n = 0);

  // Resize the container so that each sub-array has size n, invalidating references and values.
  void resizeNoCopy(std::size_t n);

  // Copy the values of rhs asynchronously.
  // Copies the raw byte buffer, so rhs must have the same Ts... (enforced by the
  // signature) for the sub-array layout to match.
  template <DeviceType other_device>
  void setAsync(const MultiVector<other_device, Ts...>& rhs, const linalg::util::CudaStream& stream) {
    size_ = rhs.size_;
    data_.setAsync(rhs.data_, stream);
  }

  // Returns a pointer to the beginning of the id-th array
  // Preconditions: 0 <= id < length(Ts...).
  // NOTE(review): sub-arrays are packed with no alignment padding (see offset()),
  // so the returned pointer is only properly aligned when
  // size() * (sum of sizeof of the preceding types) is a multiple of
  // alignof(Type<id>) — confirm callers only use sizes/types where this holds.
  template <unsigned id>
  auto get() -> Type<id>*;
  template <unsigned id>
  auto get() const -> const Type<id>*;

  // Number of elements in each sub-array.
  std::size_t size() const {
    return size_;
  }

  // Allows setAsync to access the data on another device.
  template <DeviceType other_device, typename... T2s>
  friend class MultiVector;

private:
  // Byte offset of the id-th sub-array within data_.
  template <unsigned id>
  std::size_t offset() const;

  // Flat byte buffer holding all sub-arrays contiguously.
  Vector<unsigned char, device> data_;
  // Common length of the sub-arrays. Set by the constructor via resizeNoCopy().
  std::size_t size_;
};

// Constructs the container with each sub-array holding n elements.
// Delegates the single allocation to resizeNoCopy(), which also sets size_.
template <DeviceType device, typename... Ts>
MultiVector<device, Ts...>::MultiVector(const std::size_t n) {
  resizeNoCopy(n);
}

// Reallocates the flat buffer so that each of the sizeof...(Ts) sub-arrays can
// hold n elements. Previous contents are discarded ("no copy").
template <DeviceType device, typename... Ts>
void MultiVector<device, Ts...>::resizeNoCopy(const std::size_t n) {
  size_ = n;
  // Total bytes: n elements per type, times the summed sizeof of all types.
  data_.resizeNoCopy(n * dca::util::size_sum<Ts...>);
}

// Returns a mutable pointer to the first element of the id-th sub-array.
template <DeviceType device, typename... Ts>
template <unsigned id>
auto MultiVector<device, Ts...>::get() -> Type<id>* {
  return reinterpret_cast<Type<id>*>(data_.ptr() + offset<id>());
}

// Const overload: returns a read-only pointer to the id-th sub-array.
template <DeviceType device, typename... Ts>
template <unsigned id>
auto MultiVector<device, Ts...>::get() const -> const Type<id>* {
  return reinterpret_cast<const Type<id>*>(data_.ptr() + offset<id>());
}

// Byte offset of the id-th sub-array inside the flat buffer. The arrays are
// stored back to back, so the offset is the common length times the summed
// sizeof of the types preceding Type<id>.
template <DeviceType device, typename... Ts>
template <unsigned id>
std::size_t MultiVector<device, Ts...>::offset() const {
  static_assert(id < sizeof...(Ts), "Invalid sub-array id.");

  // Sum of sizeof(T) over the first `id` types in Ts...
  constexpr std::size_t preceding_bytes = dca::util::size_sum<dca::util::Sublist<id, Ts...>>;
  return preceding_bytes * size_;
}

}  // namespace linalg
}  // namespace dca

#endif  // DCA_LINALG_MULTI_VECTOR_HPP
+16 −3
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <complex>
#include <cstring>
#include "dca/linalg/device_type.hpp"
#include "cuda_stream.hpp"

#ifdef DCA_HAVE_CUDA
#include <cuda_runtime.h>
@@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src
  memoryCopyCpu(dest, ld_dest, src, ld_src, size);
}

// Synchronous 1D memory copy fallback.
// Appears to be the host-only path (see the surrounding DCA_HAVE_CUDA
// conditional): the stream argument is ignored and the copy completes
// immediately via memoryCopyCpu — TODO confirm the #else placement.
template <typename ScalarType>
void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size,
                     const util::CudaStream& /*s*/) {
  memoryCopyCpu(dest, src, size);
}
// Synchronous 2D (matrix) memory copy fallback with leading dimensions.
// The stream argument is ignored; the copy completes immediately via
// memoryCopyCpu. size is (rows, cols) as a std::pair.
template <typename ScalarType>
void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src,
                     std::pair<int, int> size, const util::CudaStream& /*s*/) {
  memoryCopyCpu(dest, ld_dest, src, ld_src, size);
}

#endif  // DCA_HAVE_CUDA

}  // util
}  // linalg
}  // dca
}  // namespace util
}  // namespace linalg
}  // namespace dca

#endif  // DCA_LINALG_UTIL_COPY_HPP
+3 −3
Original line number Diff line number Diff line
@@ -100,9 +100,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb,
                                           const ScalarType beta, const int lda, const int ldb,
                                           const int ldc) {
  // TODO: store in a buffer if the performance gain is necessary.
  a_ptr_dev_.setAsync(a_ptr_, queue_);
  b_ptr_dev_.setAsync(b_ptr_, queue_);
  c_ptr_dev_.setAsync(c_ptr_, queue_);
  a_ptr_dev_.setAsync(a_ptr_, queue_.getStream());
  b_ptr_dev_.setAsync(b_ptr_, queue_.getStream());
  c_ptr_dev_.setAsync(c_ptr_, queue_.getStream());
  copied_.record(queue_);

  const int n_batched = a_ptr_.size();
Loading