Commit cc5f8e44 authored by gbalduzz's avatar gbalduzz
Browse files

Use __constant__ nfft helper.

parent 5ad53f5b
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -151,7 +151,6 @@ if (DCA_HAVE_CUDA)
    cuda_utils)
  list(APPEND DCA_LIBS
    blas_kernels
    dnfft_kernels
    lapack_kernels
    mc_kernels
    special_transform_kernels
+6 −1
Original line number Diff line number Diff line
@@ -18,8 +18,13 @@ if (CUDA_FOUND)
  # set(DCA_HAVE_CUDA TRUE CACHE INTERNAL "")
  # dca_add_haves_define(DCA_HAVE_CUDA)
  list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY})
  CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
  set(CUDA_SEPARABLE_COMPILATION ON)
  list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
#  if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
#    list(APPEND CUDA_NVCC_FLAGS "-G")
#  endif()
  # list(APPEND CUDA_NVCC_FLAGS_RELEASE "--use_fast_math")
  set(CMAKE_CUDA_STANDARD 14)
endif()

# Find MAGMA.
+13 −14
Original line number Diff line number Diff line
@@ -139,7 +139,6 @@ template <typename ScalarType, typename WDmn, typename RDmn, int oversampling>
void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCoefficients() {
  static std::once_flag flag;
  std::call_once(flag, [&]() {

    const auto& host_coeff = BaseClass::get_cubic_convolution_matrices();
    auto& dev_coeff = get_device_cubic_coeff();
    dev_coeff.resizeNoCopy(host_coeff.size());
@@ -147,13 +146,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo
               cudaMemcpyHostToDevice);

    const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix();
    const auto& add_matrix = RDmn::parameter_type::get_add_matrix();
    using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type;
    using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type;
    details::initializeNfftHelper<ScalarType>(
        BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(),
        oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(),
        PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(),
        beta_);
    details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(),
                                  add_matrix.leadingDimension(), sub_matrix.ptr(),
                                  sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(),
                                  PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(),
                                  WindowTimeDmn::get_delta(), beta_);

    assert(cudaPeekAtLastError() == cudaSuccess);
  });
@@ -192,11 +192,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate(
  config_left_dev_.setAsync(config_left_, stream_);
  times_dev_.setAsync(times_, stream_);

  details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign),
                              accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(),
                              accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
                              config_right_dev_.ptr(), times_dev_.ptr(),
                              get_device_cubic_coeff().ptr(), n, stream_);
  details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>(
      M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(),
      accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
      config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_);

  m_copied_event_.record(stream_);
}
@@ -247,8 +246,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over
  return coefficients;
}

}  // nfft
}  // math
}  // dca
}  // namespace nfft
}  // namespace math
}  // namespace dca

#endif  // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP
+13 −14
Original line number Diff line number Diff line
@@ -29,23 +29,22 @@ struct ConfigElem {
  int site;
};

template <typename ScalarType>
void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out,
                        ScalarType* out_sqr, const int ldo, const ConfigElem* config_left,
                        const ConfigElem* config_right, const ScalarType* tau,
                        const ScalarType* cubic_coeff, int size, cudaStream_t stream_);
template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut>
void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out,
                        ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left,
                        const ConfigElem* config_right, const ScalarIn* tau,
                        const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_);

template <typename ScalarType>
void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream);

template <typename ScalarType>
void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling,
                          int window_sampling, ScalarType t0, ScalarType delta_t,
                          ScalarType t0_window, ScalarType delta_t_window, ScalarType beta);

}  // details
}  // nfft
}  // math
}  // dca
void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds,
                          double t0, double delta_t, double t0_window, double delta_t_window,
                          double beta);

}  // namespace details
}  // namespace nfft
}  // namespace math
}  // namespace dca

#endif  // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP
+0 −191
Original line number Diff line number Diff line
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE.txt for terms of usage.
// See CITATION.txt for citation guidelines if you use this code for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// Helper class for computing interpolation indices during the single particle accumulation.

#ifndef DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER
#define DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER

#include <array>
#include <cassert>
#include <cuda.h>
#include <stdexcept>

#include "dca/math/nfft/nfft_mode_names.hpp"

namespace dca {
namespace math {
namespace nfft {
namespace details {
// dca::math::nfft::details::

// Device-side helper holding precomputed NFFT parameters in GPU global memory.
// Instances are shallow-copied (default copy constructor) so they can be passed
// to kernels by value; ownership of the device allocations stays with the
// derived NfftHelperManager, which frees them in its destructor.
template <typename ScalarType>
class NfftHelper {
public:
  // Allocates device storage and uploads the site subtraction matrix and the
  // integer/real parameter packs. Throws std::logic_error if called twice.
  __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling,
                    int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window,
                    ScalarType delta_t_window, ScalarType beta);

  // Shallow copy: duplicates the raw device pointers without transferring
  // ownership.
  NfftHelper(const NfftHelper& other) = default;

  // True once set() has uploaded the parameter arrays.
  __host__ bool isInitialized() const {
    return parameters_int_ != nullptr;
  }

  // Flattened index b1 + nb * (b2 + nb * deltaR(r2, r1)).
  __device__ int inline computeLinearIndex(int b1, int b2, int r1, int r2) const;

  // Scaled time difference (t1 - t2) * tau_scale, with tau_scale = 0.5 / beta.
  __device__ ScalarType inline computeTau(ScalarType t1, ScalarType t2) const;

  // Entry (r1, r2) of the device copy of the site subtraction matrix.
  __device__ inline int deltaR(int r1, int r2) const;

  // Computes the padded-time index, the convolution-coefficient offset and the
  // residual time shift used for the window-function interpolation.
  // mode must be LINEAR or CUBIC (enforced by a static_assert).
  template <NfftModeNames mode>
  __device__ inline void computeInterpolationIndices(ScalarType t, int& t_idx, int& conv_coeff_idx,
                                                     ScalarType& delta_t) const;

  __device__ inline int get_oversampling() const {
    return parameters_int_[3];
  }

protected:
  // This object can be constructed only through its derived class.
  NfftHelper() = default;

  // Device copy of the site subtraction matrix (leading dimension lds).
  int* sub_matrix_ = nullptr;

  // Stores in this order { nb, nr, lds, oversampling, window_sampling}
  int* parameters_int_ = nullptr;
  // Stores in this order { t0, delta_t, t0_window, delta_t_window, one_div_delta,
  // one_div_delta_window, tau_scale}
  ScalarType* parameters_real_ = nullptr;
};

// Host-side owner of the NfftHelper device allocations. Copying is deleted:
// the destructor frees the device pointers, so a shallow copy of the manager
// would lead to a double free.
template <typename ScalarType>
class NfftHelperManager : public NfftHelper<ScalarType> {
public:
  NfftHelperManager() = default;

  // Frees the device-side arrays allocated by set().
  __host__ ~NfftHelperManager();

  __host__ __device__ NfftHelperManager(const NfftHelperManager& other) = delete;

  // Forwards to NfftHelper::set and caches the oversampling on the host.
  __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling,
                    int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window,
                    ScalarType delta_t_window, ScalarType beta);

  // Host-side accessor; valid only after set() has been called.
  int get_oversampling() const {
    assert(NfftHelper<ScalarType>::isInitialized());
    return oversampling_;
  }

private:
  // Host cache of the oversampling (the device copy lives in parameters_int_[3]).
  int oversampling_ = -1;
};

// Uploads all time-independent interpolation parameters to GPU global memory.
// May be called only once per object.
//
// sub_r: host-side site subtraction matrix with leading dimension lds.
// Throws: std::logic_error on repeated initialization,
//         std::runtime_error if a CUDA allocation or copy fails.
template <typename ScalarType>
__host__ void NfftHelper<ScalarType>::set(const int nb, const int nr, const int* sub_r, int lds,
                                          int oversampling, int window_sampling, ScalarType t0,
                                          ScalarType delta_t, const ScalarType t0_window,
                                          const ScalarType delta_t_window, const ScalarType beta) {
  if (isInitialized())
    throw(std::logic_error("already initialized."));

  // Check every CUDA call: a silent failure here would leave dangling device
  // pointers that only surface later as corrupt results inside the kernels.
  auto check = [](cudaError_t err) {
    if (err != cudaSuccess)
      throw(std::runtime_error(cudaGetErrorString(err)));
  };

  check(cudaMalloc(&sub_matrix_, sizeof(int) * lds * nr));
  check(cudaMemcpy(sub_matrix_, sub_r, sizeof(int) * lds * nr, cudaMemcpyHostToDevice));

  const std::array<int, 5> parameters_int_host{nb, nr, lds, oversampling, window_sampling};
  check(cudaMalloc(&parameters_int_, sizeof(int) * parameters_int_host.size()));
  check(cudaMemcpy(parameters_int_, parameters_int_host.data(),
                   sizeof(int) * parameters_int_host.size(), cudaMemcpyHostToDevice));

  const ScalarType one(1);
  // tau_scale rescales a time difference by 0.5 / beta (see computeTau).
  const ScalarType tau_scale = ScalarType(0.5) / beta;
  const std::array<ScalarType, 7> parameters_real_host{
      t0, delta_t, t0_window, delta_t_window, one / delta_t, one / delta_t_window, tau_scale};
  check(cudaMalloc(&parameters_real_, sizeof(ScalarType) * parameters_real_host.size()));
  check(cudaMemcpy(parameters_real_, parameters_real_host.data(),
                   sizeof(ScalarType) * parameters_real_host.size(), cudaMemcpyHostToDevice));
}

// Returns the entry (r1, r2) of the device-side site subtraction matrix.
template <typename ScalarType>
__device__ int NfftHelper<ScalarType>::deltaR(const int r1, const int r2) const {
  const int leading_dim = parameters_int_[2];  // lds uploaded by set().
  return sub_matrix_[r2 * leading_dim + r1];
}

// Initializes the base helper on the device, then caches the oversampling on
// the host side for fast access without a device read.
template <typename ScalarType>
__host__ void NfftHelperManager<ScalarType>::set(const int nb, const int nr, const int* sub_r,
                                                 int lds, int oversampling, int window_sampling,
                                                 ScalarType t0, ScalarType delta_t,
                                                 const ScalarType t0_window,
                                                 const ScalarType delta_t_window,
                                                 const ScalarType beta) {
  using Base = NfftHelper<ScalarType>;
  // Upload first: if the base set() throws, oversampling_ keeps its sentinel
  // value and isInitialized() stays false.
  Base::set(nb, nr, sub_r, lds, oversampling, window_sampling, t0, delta_t, t0_window,
            delta_t_window, beta);
  oversampling_ = oversampling;
}

// Flattens (b1, b2, site difference) into a single index with stride nb.
template <typename ScalarType>
__device__ int inline NfftHelper<ScalarType>::computeLinearIndex(const int b1, const int b2,
                                                                 const int r1, const int r2) const {
  const int n_bands = parameters_int_[0];
  // Note: the site difference is looked up with the arguments swapped, (r2, r1).
  const int dr = deltaR(r2, r1);
  return b1 + n_bands * b2 + n_bands * n_bands * dr;
}

// Rescales the time difference t1 - t2 by tau_scale = 0.5 / beta
// (parameters_real_[6], uploaded by set()).
template <typename ScalarType>
__device__ ScalarType NfftHelper<ScalarType>::computeTau(ScalarType t1, ScalarType t2) const {
  return (t1 - t2) * parameters_real_[6];
}

// Given a time t, computes:
//  - t_idx: index on the padded time grid, shifted back by the oversampling,
//  - conv_coeff_idx: offset into the convolution-coefficient storage,
//  - delta_t: residual distance between the grid point and the nearest
//    window-function sample.
template <typename ScalarType>
template <NfftModeNames mode>
__device__ void NfftHelper<ScalarType>::computeInterpolationIndices(ScalarType t, int& t_idx,
                                                                    int& conv_coeff_idx,
                                                                    ScalarType& delta_t) const {
  static_assert(mode == CUBIC || mode == LINEAR,
                "This method is defined only for linear or cubic interpolations.");

  // Unpack the real-valued parameters uploaded by set().
  const ScalarType t0 = parameters_real_[0];
  const ScalarType delta_t_padded = parameters_real_[1];
  const ScalarType t0_window = parameters_real_[2];
  const ScalarType delta_t_window = parameters_real_[3];
  auto get_tau = [=](int index) { return t0 + index * delta_t_padded; };
  auto get_window_tau = [&](int index) { return t0_window + index * delta_t_window; };
  const ScalarType one_div_delta = parameters_real_[4];
  const ScalarType one_div_delta_wnd = parameters_real_[5];

  // Position of t on the padded time grid (float-to-int truncation).
  t_idx = (t - t0) * one_div_delta;
  // Position of the residual (grid point minus t) on the finer window grid.
  const int t_wnd_idx = (get_tau(t_idx) - t - t0_window) * one_div_delta_wnd;
  delta_t = get_tau(t_idx) - t - get_window_tau(t_wnd_idx);

  const int oversampling = parameters_int_[3];
  const int window_sampling = parameters_int_[4];
  // Shift both indices back by the oversampling (window_sampling samples per
  // padded-grid step on the window grid).
  t_idx -= oversampling;
  const int delta_tau_index = t_wnd_idx - oversampling * window_sampling;
  // Split into the window-sample component (j) and the coarse component (i).
  const int j = delta_tau_index % window_sampling;
  const int i = (delta_tau_index - j) / window_sampling;

  // Cubic interpolation uses 4 coefficients per point, linear uses 2.
  constexpr int interpolation_size = mode == CUBIC ? 4 : 2;
  conv_coeff_idx = interpolation_size * i + interpolation_size * 4 * oversampling * j;
}

// Releases the device-side storage owned through the NfftHelper base.
template <typename ScalarType>
__host__ NfftHelperManager<ScalarType>::~NfftHelperManager() {
  using Base = NfftHelper<ScalarType>;
  cudaFree(Base::parameters_real_);
  cudaFree(Base::parameters_int_);
  cudaFree(Base::sub_matrix_);
}

}  // details
}  // nfft
}  // math
}  // dca

#endif  // DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER
Loading