Loading include/dca/math/nfft/dnfft_1d.hpp +8 −8 Original line number Diff line number Diff line Loading @@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact( const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element(); const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta(); int lambda_0 = (t_val - T_0) * one_div_Delta; const int lambda_0 = (t_val - T_0) * one_div_Delta; for (int l = -oversampling; l <= oversampling; ++l) f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val); for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l) f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() { return phi_wn; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_HPP include/dca/math/nfft/nfft_atomic_convolution.hpp +25 −10 Original line number Diff line number Diff line Loading @@ -18,12 +18,12 @@ namespace nfft { // dca::math::nfft:: template <int max_count, int count> struct nfft_atomic_convolution { struct NfftAtomicConvolutionImpl { template <typename ScalarType> inline static void execute_linear(ScalarType* f, const ScalarType* M, const ScalarType* y) { f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]); nfft_atomic_convolution<max_count, count + 1>::execute_linear(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_linear(f, M, y); } template <typename ScalarType> Loading @@ -31,7 +31,7 @@ struct nfft_atomic_convolution { f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] + M[3 + 4 * count] * y[3]); nfft_atomic_convolution<max_count, count + 1>::execute_cubic(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_cubic(f, M, y); } template <typename ScalarType> Loading @@ -54,14 +54,14 @@ struct nfft_atomic_convolution { inline static void execute_Mt_y_1(ScalarType* f, const ScalarType* M, const ScalarType* y) { f[count] += M[count] * y[0]; nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_1(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_1(f, M, y); } template <typename ScalarType> inline static void execute_Mt_y_2(ScalarType* f, const ScalarType* M, const ScalarType* y) { f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]); nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_2(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_2(f, M, y); } template <typename ScalarType> Loading @@ -69,12 +69,12 @@ struct nfft_atomic_convolution { f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] + M[3 + 4 * count] * y[3]); nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_4(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_4(f, M, y); } }; template <int max_count> struct nfft_atomic_convolution<max_count, max_count> { struct NfftAtomicConvolutionImpl<max_count, max_count> { template <typename ScalarType> inline static void execute_linear(ScalarType* /*f*/, const ScalarType* /*y*/, const ScalarType* /*M*/) {} Loading @@ -92,8 +92,23 @@ struct nfft_atomic_convolution<max_count, max_count> { const ScalarType* /*y*/) {} }; } // nfft } // math } // dca template <int oversampling> struct NfftAtomicConvolution { template <typename ScalarType> inline static void execute_linear(ScalarType* f, const ScalarType* y, const ScalarType* M) { static_assert(oversampling > 1, "Invalid oversampling size."); NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_linear(f, y, M); } template <typename ScalarType> inline static void execute_cubic(ScalarType* f, const ScalarType* y, const ScalarType* M) { static_assert(oversampling > 1, "Invalid oversampling size."); NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_cubic(f, y, M); } }; } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_NFFT_ATOMIC_CONVOLUTION_HPP src/math/nfft/dnfft_1d_kernels.cu +5 −7 Original line number Diff line number Diff line Loading @@ -47,10 +47,10 @@ template <int oversampling, int window_sampling, typename ScalarIn, typename Sca bool accumulate_m_sqr = false> __global__ void accumulateOnDeviceKernel( const ScalarIn* __restrict__ M, const int ldm, const ScalarIn sign, ScalarOut* __restrict__ out, ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* __restrict__ times, ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* __restrict__ config_left, const ConfigElem* __restrict__ config_right, const ScalarIn* __restrict__ times, const ScalarOut* __restrict__ cubic_coeff, const int m_size) { constexpr int conv_size = 2 * oversampling + 1; constexpr int conv_size = 2 * oversampling; int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx >= m_size * m_size * conv_size) return; Loading @@ -59,7 +59,7 @@ __global__ void accumulateOnDeviceKernel( const int m_j = thread_idx / (m_size * conv_size); thread_idx -= m_j * (m_size * conv_size); const int m_i = thread_idx / conv_size; const int conv_idx = thread_idx - m_i * conv_size; const int conv_idx = thread_idx - m_i * conv_size + 1; const ScalarOut tau = nfft_helper.computeTau(times[m_i], times[m_j]); Loading Loading @@ -89,10 +89,8 @@ void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, S ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* tau, const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_) { const auto blocks = getBlockSize(size * size * (2 * oversampling + 1), 128); const auto blocks = getBlockSize(size * size * (2 * oversampling), 128); // TODO: check if there is a performance gain in using a block size that is a multiple of // convolution_size. if (out_sqr) { accumulateOnDeviceKernel<oversampling, window_sampling, ScalarIn, ScalarOut, true> <<<blocks[0], blocks[1], 0, stream_>>>(M, ldm, sign, out, out_sqr, ldo, config_left, Loading Loading
include/dca/math/nfft/dnfft_1d.hpp +8 −8 Original line number Diff line number Diff line Loading @@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact( const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element(); const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta(); int lambda_0 = (t_val - T_0) * one_div_Delta; const int lambda_0 = (t_val - T_0) * one_div_Delta; for (int l = -oversampling; l <= oversampling; ++l) f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val); for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l) f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() { return phi_wn; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_HPP
include/dca/math/nfft/nfft_atomic_convolution.hpp +25 −10 Original line number Diff line number Diff line Loading @@ -18,12 +18,12 @@ namespace nfft { // dca::math::nfft:: template <int max_count, int count> struct nfft_atomic_convolution { struct NfftAtomicConvolutionImpl { template <typename ScalarType> inline static void execute_linear(ScalarType* f, const ScalarType* M, const ScalarType* y) { f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]); nfft_atomic_convolution<max_count, count + 1>::execute_linear(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_linear(f, M, y); } template <typename ScalarType> Loading @@ -31,7 +31,7 @@ struct nfft_atomic_convolution { f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] + M[3 + 4 * count] * y[3]); nfft_atomic_convolution<max_count, count + 1>::execute_cubic(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_cubic(f, M, y); } template <typename ScalarType> Loading @@ -54,14 +54,14 @@ struct nfft_atomic_convolution { inline static void execute_Mt_y_1(ScalarType* f, const ScalarType* M, const ScalarType* y) { f[count] += M[count] * y[0]; nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_1(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_1(f, M, y); } template <typename ScalarType> inline static void execute_Mt_y_2(ScalarType* f, const ScalarType* M, const ScalarType* y) { f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]); nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_2(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_2(f, M, y); } template <typename ScalarType> Loading @@ -69,12 +69,12 @@ struct nfft_atomic_convolution { f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] + M[3 + 4 * count] * y[3]); nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_4(f, M, y); NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_4(f, M, y); } }; template <int max_count> struct nfft_atomic_convolution<max_count, max_count> { struct NfftAtomicConvolutionImpl<max_count, max_count> { template <typename ScalarType> inline static void execute_linear(ScalarType* /*f*/, const ScalarType* /*y*/, const ScalarType* /*M*/) {} Loading @@ -92,8 +92,23 @@ struct nfft_atomic_convolution<max_count, max_count> { const ScalarType* /*y*/) {} }; } // nfft } // math } // dca template <int oversampling> struct NfftAtomicConvolution { template <typename ScalarType> inline static void execute_linear(ScalarType* f, const ScalarType* y, const ScalarType* M) { static_assert(oversampling > 1, "Invalid oversampling size."); NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_linear(f, y, M); } template <typename ScalarType> inline static void execute_cubic(ScalarType* f, const ScalarType* y, const ScalarType* M) { static_assert(oversampling > 1, "Invalid oversampling size."); NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_cubic(f, y, M); } }; } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_NFFT_ATOMIC_CONVOLUTION_HPP
src/math/nfft/dnfft_1d_kernels.cu +5 −7 Original line number Diff line number Diff line Loading @@ -47,10 +47,10 @@ template <int oversampling, int window_sampling, typename ScalarIn, typename Sca bool accumulate_m_sqr = false> __global__ void accumulateOnDeviceKernel( const ScalarIn* __restrict__ M, const int ldm, const ScalarIn sign, ScalarOut* __restrict__ out, ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* __restrict__ times, ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* __restrict__ config_left, const ConfigElem* __restrict__ config_right, const ScalarIn* __restrict__ times, const ScalarOut* __restrict__ cubic_coeff, const int m_size) { constexpr int conv_size = 2 * oversampling + 1; constexpr int conv_size = 2 * oversampling; int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx >= m_size * m_size * conv_size) return; Loading @@ -59,7 +59,7 @@ __global__ void accumulateOnDeviceKernel( const int m_j = thread_idx / (m_size * conv_size); thread_idx -= m_j * (m_size * conv_size); const int m_i = thread_idx / conv_size; const int conv_idx = thread_idx - m_i * conv_size; const int conv_idx = thread_idx - m_i * conv_size + 1; const ScalarOut tau = nfft_helper.computeTau(times[m_i], times[m_j]); Loading Loading @@ -89,10 +89,8 @@ void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, S ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* tau, const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_) { const auto blocks = getBlockSize(size * size * (2 * oversampling + 1), 128); const auto blocks = getBlockSize(size * size * (2 * oversampling), 128); // TODO: check if there is a performance gain in using a block size that is a multiple of // convolution_size. if (out_sqr) { accumulateOnDeviceKernel<oversampling, window_sampling, ScalarIn, ScalarOut, true> <<<blocks[0], blocks[1], 0, stream_>>>(M, ldm, sign, out, out_sqr, ldo, config_left, Loading