Commit be9df424 authored by gbalduzz's avatar gbalduzz
Browse files

Merge branch 'generic_container' into autocorrelation

parents c1a20408 30bb897e
Loading
Loading
Loading
Loading
+16 −3
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <complex>
#include <cstring>
#include "dca/linalg/device_type.hpp"
#include "cuda_stream.hpp"

#ifdef DCA_HAVE_CUDA
#include <cuda_runtime.h>
@@ -141,10 +142,22 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src
  memoryCopyCpu(dest, ld_dest, src, ld_src, size);
}

// Synchronous 1D memory copy fallback.
template <typename ScalarType>
void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size,
                     const util::CudaStream& /*s*/) {
  memoryCopyCpu(dest, src, size);
}
template <typename ScalarType>
void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src,
                     std::pair<int, int> size, const util::CudaStream& /*s*/) {
  memoryCopyCpu(dest, ld_dest, src, ld_src, size);
}

#endif  // DCA_HAVE_CUDA

}  // util
}  // linalg
}  // dca
}  // namespace util
}  // namespace linalg
}  // namespace dca

#endif  // DCA_LINALG_UTIL_COPY_HPP
+5 −0
Original line number Diff line number Diff line
@@ -31,10 +31,15 @@ public:
  }

  CudaStream(const CudaStream& other) = delete;
  CudaStream& operator=(const CudaStream& other) = delete;

  CudaStream(CudaStream&& other) {
    std::swap(stream_, other.stream_);
  }
  CudaStream& operator=(CudaStream&& other) {
    std::swap(stream_, other.stream_);
    return *this;
  }

  void sync() const {
    checkRC(cudaStreamSynchronize(stream_));
+16 −1
Original line number Diff line number Diff line
@@ -36,8 +36,17 @@ public:
  }

  MagmaQueue(const MagmaQueue& rhs) = delete;
  MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

  MagmaQueue(MagmaQueue&& rhs) = default;
  MagmaQueue(MagmaQueue&& rhs) : CudaStream(std::move(rhs)) {
    swapMembers(rhs);
  }

  MagmaQueue& operator=(MagmaQueue&& rhs) {
    CudaStream::operator=(std::move(rhs));
    swapMembers(rhs);
    return *this;
  }

  ~MagmaQueue() {
    magma_queue_destroy(queue_);
@@ -50,6 +59,12 @@ public:
  }

private:
  void swapMembers(MagmaQueue& rhs) {
    std::swap(cublas_handle_, rhs.cublas_handle_);
    std::swap(cusparse_handle_, rhs.cusparse_handle_);
    std::swap(queue_, rhs.queue_);
  }

  magma_queue_t queue_ = nullptr;
  cublasHandle_t cublas_handle_ = nullptr;
  cusparseHandle_t cusparse_handle_ = nullptr;
+15 −14
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include "dca/linalg/lapack/magma.hpp"
#include "dca/linalg/util/allocators/vectors_typedefs.hpp"
#include "dca/linalg/util/cuda_event.hpp"
#include "dca/linalg/util/magma_queue.hpp"
#include "dca/linalg/vector.hpp"

namespace dca {
@@ -30,10 +31,10 @@ template <typename ScalarType>
class MagmaVBatchedGemm {
public:
  // Creates a plan for a batched gemm with variable size.
  MagmaVBatchedGemm(magma_queue_t queue);
  MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue);
  // Creates a plan and allocates the memory for the arguments of `size`
  // multiplications.
  MagmaVBatchedGemm(int size, magma_queue_t queue);
  MagmaVBatchedGemm(int size, const linalg::util::MagmaQueue& queue);

  // Allocates the memory for the arguments of `size` multiplications.
  void reserve(int size);
@@ -52,8 +53,8 @@ public:
  void synchronizeCopy();

private:
  magma_queue_t queue_;
  const cudaStream_t stream_;
  const linalg::util::MagmaQueue& queue_;
  const linalg::util::CudaStream& stream_;
  CudaEvent copied_;

  linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_;
@@ -67,11 +68,11 @@ private:
};

template <typename ScalarType>
MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(magma_queue_t queue)
    : queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)), m_max_(0), n_max_(0), k_max_(0) {}
MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const linalg::util::MagmaQueue& queue)
    : queue_(queue), stream_(queue), m_max_(0), n_max_(0), k_max_(0) {}

template <typename ScalarType>
MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, magma_queue_t queue)
MagmaVBatchedGemm<ScalarType>::MagmaVBatchedGemm(const int size, const linalg::util::MagmaQueue& queue)
    : MagmaVBatchedGemm(queue) {
  reserve(size);
}
@@ -126,10 +127,10 @@ void MagmaVBatchedGemm<ScalarType>::execute(const char transa, const char transb
  copied_.record(stream_);

  const int n_batched = a_ptr_.size();
  magma::magmablas_gemm_vbatched_max_nocheck(transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(),
                                 ScalarType(1), a_ptr_dev_.ptr(), lda_dev_.ptr(), b_ptr_dev_.ptr(),
                                 ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(), ldc_dev_.ptr(),
                                 n_batched, m_max_, n_max_, k_max_, queue_);
  magma::magmablas_gemm_vbatched_max_nocheck(
      transa, transb, m_dev_.ptr(), n_dev_.ptr(), k_dev_.ptr(), ScalarType(1), a_ptr_dev_.ptr(),
      lda_dev_.ptr(), b_ptr_dev_.ptr(), ldb_dev_.ptr(), ScalarType(0), c_ptr_dev_.ptr(),
      ldc_dev_.ptr(), n_batched, m_max_, n_max_, k_max_, queue_);

  assert(cudaPeekAtLastError() == cudaSuccess);
}
@@ -151,9 +152,9 @@ void MagmaVBatchedGemm<ScalarType>::synchronizeCopy() {
  m_max_ = n_max_ = k_max_ = 0;
}

}  // util
}  // linalg
}  // dca
}  // namespace util
}  // namespace linalg
}  // namespace dca

#endif  // DCA_HAVE_CUDA
#endif  // DCA_LINALG_UTIL_MAGMA_VBATCHED_GEMM_HPP
+9 −4
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include <stdexcept>

#include "dca/linalg/device_type.hpp"
#include "dca/linalg/util/cuda_stream.hpp"
#include "dca/util/ignore.hpp"

#ifdef DCA_HAVE_CUDA
@@ -48,11 +49,10 @@ struct Memory<CPU> {
      std::complex<ScalarType>* ptr, size_t size) {
    std::memset(static_cast<void*>(ptr), 0, size * sizeof(std::complex<ScalarType>));
  }

  // Do nothing for non arithmetic types.
  template <typename ScalarType>
  static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero(
      ScalarType /*ptr*/, size_t /*size*/) {}
  static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& /*s*/) {
    setToZero(ptr, size);
  }
};

#ifdef DCA_HAVE_CUDA
@@ -75,6 +75,11 @@ struct Memory<GPU> {
  template <typename ScalarType>
  static std::enable_if_t<std::is_arithmetic<ScalarType>::value == false, void> setToZero(
      ScalarType /*ptr*/, size_t /*size*/) {}

  template <typename ScalarType>
  static void setToZeroAsync(ScalarType* ptr, size_t size, const CudaStream& stream) {
    cudaMemsetAsync(ptr, 0, size * sizeof(ScalarType), stream);
  }
};
#endif  // DCA_HAVE_CUDA

Loading