Loading CMakeLists.txt +0 −1 Original line number Diff line number Diff line Loading @@ -151,7 +151,6 @@ if (DCA_HAVE_CUDA) cuda_utils) list(APPEND DCA_LIBS blas_kernels dnfft_kernels lapack_kernels mc_kernels special_transform_kernels Loading cmake/dca_cuda.cmake +6 −1 Original line number Diff line number Diff line Loading @@ -18,8 +18,13 @@ if (CUDA_FOUND) # set(DCA_HAVE_CUDA TRUE CACHE INTERNAL "") # dca_add_haves_define(DCA_HAVE_CUDA) list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY}) CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") # if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") # list(APPEND CUDA_NVCC_FLAGS "-G") # endif() # list(APPEND CUDA_NVCC_FLAGS_RELEASE "--use_fast_math") set(CMAKE_CUDA_STANDARD 14) endif() # Find MAGMA. Loading include/dca/math/nfft/dnfft_1d_gpu.hpp +13 −14 Original line number Diff line number Diff line Loading @@ -139,7 +139,6 @@ template <typename ScalarType, typename WDmn, typename RDmn, int oversampling> void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCoefficients() { static std::once_flag flag; std::call_once(flag, [&]() { const auto& host_coeff = BaseClass::get_cubic_convolution_matrices(); auto& dev_coeff = get_device_cubic_coeff(); dev_coeff.resizeNoCopy(host_coeff.size()); Loading @@ -147,13 +146,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo cudaMemcpyHostToDevice); const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix(); const auto& add_matrix = RDmn::parameter_type::get_add_matrix(); using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type; using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type; details::initializeNfftHelper<ScalarType>( BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(), oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(), add_matrix.leadingDimension(), sub_matrix.ptr(), sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); assert(cudaPeekAtLastError() == cudaSuccess); }); Loading Loading @@ -192,11 +192,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate( config_left_dev_.setAsync(config_left_, stream_); times_dev_.setAsync(times_, stream_); details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>( M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); m_copied_event_.record(stream_); } Loading Loading @@ -247,8 +246,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over return coefficients; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP include/dca/math/nfft/kernels_interface.hpp +13 −14 Original line number Diff line number Diff line Loading @@ -29,23 +29,22 @@ struct ConfigElem { int site; }; template <typename ScalarType> void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out, ScalarType* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarType* tau, const ScalarType* cubic_coeff, int size, cudaStream_t stream_); template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut> void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out, ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* tau, const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_); template <typename ScalarType> void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream); template <typename ScalarType> void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); } // details } // nfft } // math } // dca void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds, double t0, double delta_t, double t0_window, double delta_t_window, double beta); } // namespace details } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP include/dca/math/nfft/nfft_helper.cu.hppdeleted 100644 → 0 +0 −191 Original line number Diff line number Diff line // Copyright (C) 2018 ETH Zurich // Copyright (C) 2018 UT-Battelle, LLC // All rights reserved. // // See LICENSE.txt for terms of usage. // See CITATION.txt for citation guidelines if you use this code for scientific publications. // // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // // Helper class for computing interpolation indices during the single particle accumulation. #ifndef DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER #define DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER #include <array> #include <cassert> #include <cuda.h> #include <stdexcept> #include "dca/math/nfft/nfft_mode_names.hpp" namespace dca { namespace math { namespace nfft { namespace details { // dca::math::nfft::details:: template <typename ScalarType> class NfftHelper { public: __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); NfftHelper(const NfftHelper& other) = default; __host__ bool isInitialized() const { return parameters_int_ != nullptr; } __device__ int inline computeLinearIndex(int b1, int b2, int r1, int r2) const; __device__ ScalarType inline computeTau(ScalarType t1, ScalarType t2) const; __device__ inline int deltaR(int r1, int r2) const; template <NfftModeNames mode> __device__ inline void computeInterpolationIndices(ScalarType t, int& t_idx, int& conv_coeff_idx, ScalarType& delta_t) const; __device__ inline int get_oversampling() const { return parameters_int_[3]; } protected: // This object can be constructed only through its derived class. NfftHelper() = default; int* sub_matrix_ = nullptr; // Stores in this order { nb, nr, lds, oversampling, window_sampling} int* parameters_int_ = nullptr; // Stores in this order { t0, delta_t, t0_window, delta_t_window, one_div_delta, // one_div_delta_window, tau_scale} ScalarType* parameters_real_ = nullptr; }; template <typename ScalarType> class NfftHelperManager : public NfftHelper<ScalarType> { public: NfftHelperManager() = default; __host__ ~NfftHelperManager(); __host__ __device__ NfftHelperManager(const NfftHelperManager& other) = delete; __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); int get_oversampling() const { assert(NfftHelper<ScalarType>::isInitialized()); return oversampling_; } private: int oversampling_ = -1; }; template <typename ScalarType> __host__ void NfftHelper<ScalarType>::set(const int nb, const int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, const ScalarType t0_window, const ScalarType delta_t_window, const ScalarType beta) { if (isInitialized()) throw(std::logic_error("already initialized.")); cudaMalloc(&sub_matrix_, sizeof(int) * lds * nr); cudaMemcpy(sub_matrix_, sub_r, sizeof(int) * lds * nr, cudaMemcpyHostToDevice); const std::array<int, 5> parameters_int_host{nb, nr, lds, oversampling, window_sampling}; cudaMalloc(¶meters_int_, sizeof(int) * parameters_int_host.size()); cudaMemcpy(parameters_int_, parameters_int_host.data(), sizeof(int) * parameters_int_host.size(), cudaMemcpyHostToDevice); ScalarType one(1); const ScalarType tau_scale = ScalarType(0.5) / beta; const std::array<ScalarType, 7> parameters_real_host{ t0, delta_t, t0_window, delta_t_window, one / delta_t, one / delta_t_window, tau_scale}; cudaMalloc(¶meters_real_, sizeof(ScalarType) * parameters_real_host.size()); cudaMemcpy(parameters_real_, parameters_real_host.data(), sizeof(ScalarType) * parameters_real_host.size(), cudaMemcpyHostToDevice); } template <typename ScalarType> __device__ int NfftHelper<ScalarType>::deltaR(const int r1, const int r2) const { const int ld = parameters_int_[2]; return sub_matrix_[r1 + ld * r2]; } template <typename ScalarType> __host__ void NfftHelperManager<ScalarType>::set(const int nb, const int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, const ScalarType t0_window, const ScalarType delta_t_window, const ScalarType beta) { NfftHelper<ScalarType>::set(nb, nr, sub_r, lds, oversampling, window_sampling, t0, delta_t, t0_window, delta_t_window, beta); oversampling_ = oversampling; } template <typename ScalarType> __device__ int inline NfftHelper<ScalarType>::computeLinearIndex(const int b1, const int b2, const int r1, const int r2) const { const int delta_r = deltaR(r2, r1); const int nb = parameters_int_[0]; return b1 + nb * (b2 + nb * delta_r); } template <typename ScalarType> __device__ ScalarType NfftHelper<ScalarType>::computeTau(ScalarType t1, ScalarType t2) const { const ScalarType tau_scale = parameters_real_[6]; return (t1 - t2) * tau_scale; } template <typename ScalarType> template <NfftModeNames mode> __device__ void NfftHelper<ScalarType>::computeInterpolationIndices(ScalarType t, int& t_idx, int& conv_coeff_idx, ScalarType& delta_t) const { static_assert(mode == CUBIC || mode == LINEAR, "This method is defined only for linear or cubic interpolations."); const ScalarType t0 = parameters_real_[0]; const ScalarType delta_t_padded = parameters_real_[1]; const ScalarType t0_window = parameters_real_[2]; const ScalarType delta_t_window = parameters_real_[3]; auto get_tau = [=](int index) { return t0 + index * delta_t_padded; }; auto get_window_tau = [&](int index) { return t0_window + index * delta_t_window; }; const ScalarType one_div_delta = parameters_real_[4]; const ScalarType one_div_delta_wnd = parameters_real_[5]; t_idx = (t - t0) * one_div_delta; const int t_wnd_idx = (get_tau(t_idx) - t - t0_window) * one_div_delta_wnd; delta_t = get_tau(t_idx) - t - get_window_tau(t_wnd_idx); const int oversampling = parameters_int_[3]; const int window_sampling = parameters_int_[4]; t_idx -= oversampling; const int delta_tau_index = t_wnd_idx - oversampling * window_sampling; const int j = delta_tau_index % window_sampling; const int i = (delta_tau_index - j) / window_sampling; constexpr int interpolation_size = mode == CUBIC ? 4 : 2; conv_coeff_idx = interpolation_size * i + interpolation_size * 4 * oversampling * j; } template <typename ScalarType> __host__ NfftHelperManager<ScalarType>::~NfftHelperManager() { cudaFree(NfftHelper<ScalarType>::sub_matrix_); cudaFree(NfftHelper<ScalarType>::parameters_int_); cudaFree(NfftHelper<ScalarType>::parameters_real_); } } // details } // nfft } // math } // dca #endif // DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER Loading
CMakeLists.txt +0 −1 Original line number Diff line number Diff line Loading @@ -151,7 +151,6 @@ if (DCA_HAVE_CUDA) cuda_utils) list(APPEND DCA_LIBS blas_kernels dnfft_kernels lapack_kernels mc_kernels special_transform_kernels Loading
cmake/dca_cuda.cmake +6 −1 Original line number Diff line number Diff line Loading @@ -18,8 +18,13 @@ if (CUDA_FOUND) # set(DCA_HAVE_CUDA TRUE CACHE INTERNAL "") # dca_add_haves_define(DCA_HAVE_CUDA) list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY}) CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") # if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") # list(APPEND CUDA_NVCC_FLAGS "-G") # endif() # list(APPEND CUDA_NVCC_FLAGS_RELEASE "--use_fast_math") set(CMAKE_CUDA_STANDARD 14) endif() # Find MAGMA. Loading
include/dca/math/nfft/dnfft_1d_gpu.hpp +13 −14 Original line number Diff line number Diff line Loading @@ -139,7 +139,6 @@ template <typename ScalarType, typename WDmn, typename RDmn, int oversampling> void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCoefficients() { static std::once_flag flag; std::call_once(flag, [&]() { const auto& host_coeff = BaseClass::get_cubic_convolution_matrices(); auto& dev_coeff = get_device_cubic_coeff(); dev_coeff.resizeNoCopy(host_coeff.size()); Loading @@ -147,13 +146,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo cudaMemcpyHostToDevice); const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix(); const auto& add_matrix = RDmn::parameter_type::get_add_matrix(); using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type; using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type; details::initializeNfftHelper<ScalarType>( BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(), oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(), add_matrix.leadingDimension(), sub_matrix.ptr(), sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); assert(cudaPeekAtLastError() == cudaSuccess); }); Loading Loading @@ -192,11 +192,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate( config_left_dev_.setAsync(config_left_, stream_); times_dev_.setAsync(times_, stream_); details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>( M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); m_copied_event_.record(stream_); } Loading Loading @@ -247,8 +246,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over return coefficients; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP
include/dca/math/nfft/kernels_interface.hpp +13 −14 Original line number Diff line number Diff line Loading @@ -29,23 +29,22 @@ struct ConfigElem { int site; }; template <typename ScalarType> void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out, ScalarType* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarType* tau, const ScalarType* cubic_coeff, int size, cudaStream_t stream_); template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut> void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out, ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* tau, const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_); template <typename ScalarType> void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream); template <typename ScalarType> void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); } // details } // nfft } // math } // dca void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds, double t0, double delta_t, double t0_window, double delta_t_window, double beta); } // namespace details } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP
include/dca/math/nfft/nfft_helper.cu.hppdeleted 100644 → 0 +0 −191 Original line number Diff line number Diff line // Copyright (C) 2018 ETH Zurich // Copyright (C) 2018 UT-Battelle, LLC // All rights reserved. // // See LICENSE.txt for terms of usage. // See CITATION.txt for citation guidelines if you use this code for scientific publications. // // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // // Helper class for computing interpolation indices during the single particle accumulation. #ifndef DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER #define DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER #include <array> #include <cassert> #include <cuda.h> #include <stdexcept> #include "dca/math/nfft/nfft_mode_names.hpp" namespace dca { namespace math { namespace nfft { namespace details { // dca::math::nfft::details:: template <typename ScalarType> class NfftHelper { public: __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); NfftHelper(const NfftHelper& other) = default; __host__ bool isInitialized() const { return parameters_int_ != nullptr; } __device__ int inline computeLinearIndex(int b1, int b2, int r1, int r2) const; __device__ ScalarType inline computeTau(ScalarType t1, ScalarType t2) const; __device__ inline int deltaR(int r1, int r2) const; template <NfftModeNames mode> __device__ inline void computeInterpolationIndices(ScalarType t, int& t_idx, int& conv_coeff_idx, ScalarType& delta_t) const; __device__ inline int get_oversampling() const { return parameters_int_[3]; } protected: // This object can be constructed only through its derived class. NfftHelper() = default; int* sub_matrix_ = nullptr; // Stores in this order { nb, nr, lds, oversampling, window_sampling} int* parameters_int_ = nullptr; // Stores in this order { t0, delta_t, t0_window, delta_t_window, one_div_delta, // one_div_delta_window, tau_scale} ScalarType* parameters_real_ = nullptr; }; template <typename ScalarType> class NfftHelperManager : public NfftHelper<ScalarType> { public: NfftHelperManager() = default; __host__ ~NfftHelperManager(); __host__ __device__ NfftHelperManager(const NfftHelperManager& other) = delete; __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); int get_oversampling() const { assert(NfftHelper<ScalarType>::isInitialized()); return oversampling_; } private: int oversampling_ = -1; }; template <typename ScalarType> __host__ void NfftHelper<ScalarType>::set(const int nb, const int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, const ScalarType t0_window, const ScalarType delta_t_window, const ScalarType beta) { if (isInitialized()) throw(std::logic_error("already initialized.")); cudaMalloc(&sub_matrix_, sizeof(int) * lds * nr); cudaMemcpy(sub_matrix_, sub_r, sizeof(int) * lds * nr, cudaMemcpyHostToDevice); const std::array<int, 5> parameters_int_host{nb, nr, lds, oversampling, window_sampling}; cudaMalloc(¶meters_int_, sizeof(int) * parameters_int_host.size()); cudaMemcpy(parameters_int_, parameters_int_host.data(), sizeof(int) * parameters_int_host.size(), cudaMemcpyHostToDevice); ScalarType one(1); const ScalarType tau_scale = ScalarType(0.5) / beta; const std::array<ScalarType, 7> parameters_real_host{ t0, delta_t, t0_window, delta_t_window, one / delta_t, one / delta_t_window, tau_scale}; cudaMalloc(¶meters_real_, sizeof(ScalarType) * parameters_real_host.size()); cudaMemcpy(parameters_real_, parameters_real_host.data(), sizeof(ScalarType) * parameters_real_host.size(), cudaMemcpyHostToDevice); } template <typename ScalarType> __device__ int NfftHelper<ScalarType>::deltaR(const int r1, const int r2) const { const int ld = parameters_int_[2]; return sub_matrix_[r1 + ld * r2]; } template <typename ScalarType> __host__ void NfftHelperManager<ScalarType>::set(const int nb, const int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, const ScalarType t0_window, const ScalarType delta_t_window, const ScalarType beta) { NfftHelper<ScalarType>::set(nb, nr, sub_r, lds, oversampling, window_sampling, t0, delta_t, t0_window, delta_t_window, beta); oversampling_ = oversampling; } template <typename ScalarType> __device__ int inline NfftHelper<ScalarType>::computeLinearIndex(const int b1, const int b2, const int r1, const int r2) const { const int delta_r = deltaR(r2, r1); const int nb = parameters_int_[0]; return b1 + nb * (b2 + nb * delta_r); } template <typename ScalarType> __device__ ScalarType NfftHelper<ScalarType>::computeTau(ScalarType t1, ScalarType t2) const { const ScalarType tau_scale = parameters_real_[6]; return (t1 - t2) * tau_scale; } template <typename ScalarType> template <NfftModeNames mode> __device__ void NfftHelper<ScalarType>::computeInterpolationIndices(ScalarType t, int& t_idx, int& conv_coeff_idx, ScalarType& delta_t) const { static_assert(mode == CUBIC || mode == LINEAR, "This method is defined only for linear or cubic interpolations."); const ScalarType t0 = parameters_real_[0]; const ScalarType delta_t_padded = parameters_real_[1]; const ScalarType t0_window = parameters_real_[2]; const ScalarType delta_t_window = parameters_real_[3]; auto get_tau = [=](int index) { return t0 + index * delta_t_padded; }; auto get_window_tau = [&](int index) { return t0_window + index * delta_t_window; }; const ScalarType one_div_delta = parameters_real_[4]; const ScalarType one_div_delta_wnd = parameters_real_[5]; t_idx = (t - t0) * one_div_delta; const int t_wnd_idx = (get_tau(t_idx) - t - t0_window) * one_div_delta_wnd; delta_t = get_tau(t_idx) - t - get_window_tau(t_wnd_idx); const int oversampling = parameters_int_[3]; const int window_sampling = parameters_int_[4]; t_idx -= oversampling; const int delta_tau_index = t_wnd_idx - oversampling * window_sampling; const int j = delta_tau_index % window_sampling; const int i = (delta_tau_index - j) / window_sampling; constexpr int interpolation_size = mode == CUBIC ? 4 : 2; conv_coeff_idx = interpolation_size * i + interpolation_size * 4 * oversampling * j; } template <typename ScalarType> __host__ NfftHelperManager<ScalarType>::~NfftHelperManager() { cudaFree(NfftHelper<ScalarType>::sub_matrix_); cudaFree(NfftHelper<ScalarType>::parameters_int_); cudaFree(NfftHelper<ScalarType>::parameters_real_); } } // details } // nfft } // math } // dca #endif // DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER