Skip furthest convolution point (41c393bc) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/math/nfft/dnfft_1d.hpp

+8 −8

Original line number	Diff line number	Diff line
		@@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact(
		const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element();
		const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta();

		int lambda_0 = (t_val - T_0) * one_div_Delta;
		const int lambda_0 = (t_val - T_0) * one_div_Delta;

		for (int l = -oversampling; l <= oversampling; ++l)
		f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val);
		for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l)
		f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val);
		}

		template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
		@@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
		ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
		ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j);

		nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
		NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
		}

		template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
		@@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
		ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
		ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j);

		nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
		NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
		}

		template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
		@@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() {
		return phi_wn;
		}

		} // nfft
		} // math
		} // dca
		} // namespace nfft
		} // namespace math
		} // namespace dca

		#endif // DCA_MATH_NFFT_DNFFT_1D_HPP

include/dca/math/nfft/nfft_atomic_convolution.hpp

+25 −10

Original line number	Diff line number	Diff line
		@@ -18,12 +18,12 @@ namespace nfft {
		// dca::math::nfft::

		template <int max_count, int count>
		struct nfft_atomic_convolution {
		struct NfftAtomicConvolutionImpl {
		template <typename ScalarType>
		inline static void execute_linear(ScalarType* f, const ScalarType* M, const ScalarType* y) {
		f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]);

		nfft_atomic_convolution<max_count, count + 1>::execute_linear(f, M, y);
		NfftAtomicConvolutionImpl<max_count, count + 1>::execute_linear(f, M, y);
		}

		template <typename ScalarType>
		@@ -31,7 +31,7 @@ struct nfft_atomic_convolution {
		f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] +
		M[3 + 4 * count] * y[3]);

		nfft_atomic_convolution<max_count, count + 1>::execute_cubic(f, M, y);
		NfftAtomicConvolutionImpl<max_count, count + 1>::execute_cubic(f, M, y);
		}

		template <typename ScalarType>
		@@ -54,14 +54,14 @@ struct nfft_atomic_convolution {
		inline static void execute_Mt_y_1(ScalarType* f, const ScalarType* M, const ScalarType* y) {
		f[count] += M[count] * y[0];

		nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_1(f, M, y);
		NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_1(f, M, y);
		}

		template <typename ScalarType>
		inline static void execute_Mt_y_2(ScalarType* f, const ScalarType* M, const ScalarType* y) {
		f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]);

		nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_2(f, M, y);
		NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_2(f, M, y);
		}

		template <typename ScalarType>
		@@ -69,12 +69,12 @@ struct nfft_atomic_convolution {
		f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] +
		M[3 + 4 * count] * y[3]);

		nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_4(f, M, y);
		NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_4(f, M, y);
		}
		};

		template <int max_count>
		struct nfft_atomic_convolution<max_count, max_count> {
		struct NfftAtomicConvolutionImpl<max_count, max_count> {
		template <typename ScalarType>
		inline static void execute_linear(ScalarType* /f/, const ScalarType* /y/,
		const ScalarType* /M/) {}
		@@ -92,8 +92,23 @@ struct nfft_atomic_convolution<max_count, max_count> {
		const ScalarType* /y/) {}
		};

		} // nfft
		} // math
		} // dca
		template <int oversampling>
		struct NfftAtomicConvolution {
		template <typename ScalarType>
		inline static void execute_linear(ScalarType* f, const ScalarType* y, const ScalarType* M) {
		static_assert(oversampling > 1, "Invalid oversampling size.");
		NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_linear(f, y, M);
		}

		template <typename ScalarType>
		inline static void execute_cubic(ScalarType* f, const ScalarType* y, const ScalarType* M) {
		static_assert(oversampling > 1, "Invalid oversampling size.");
		NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_cubic(f, y, M);
		}
		};

		} // namespace nfft
		} // namespace math
		} // namespace dca

		#endif // DCA_MATH_NFFT_NFFT_ATOMIC_CONVOLUTION_HPP

src/math/nfft/dnfft_1d_kernels.cu

+5 −7

Original line number	Diff line number	Diff line
		@@ -47,10 +47,10 @@ template <int oversampling, int window_sampling, typename ScalarIn, typename Sca
		bool accumulate_m_sqr = false>
		__global__ void accumulateOnDeviceKernel(
		const ScalarIn* __restrict__ M, const int ldm, const ScalarIn sign, ScalarOut* __restrict__ out,
		ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* config_left,
		const ConfigElem* config_right, const ScalarIn* __restrict__ times,
		ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* __restrict__ config_left,
		const ConfigElem* __restrict__ config_right, const ScalarIn* __restrict__ times,
		const ScalarOut* __restrict__ cubic_coeff, const int m_size) {
		constexpr int conv_size = 2 * oversampling + 1;
		constexpr int conv_size = 2 * oversampling;
		int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
		if (thread_idx >= m_size * m_size * conv_size)
		return;
		@@ -59,7 +59,7 @@ __global__ void accumulateOnDeviceKernel(
		const int m_j = thread_idx / (m_size * conv_size);
		thread_idx -= m_j * (m_size * conv_size);
		const int m_i = thread_idx / conv_size;
		const int conv_idx = thread_idx - m_i * conv_size;
		const int conv_idx = thread_idx - m_i * conv_size + 1;

		const ScalarOut tau = nfft_helper.computeTau(times[m_i], times[m_j]);

		@@ -89,10 +89,8 @@ void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, S
		ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left,
		const ConfigElem* config_right, const ScalarIn* tau,
		const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_) {
		const auto blocks = getBlockSize(size * size * (2 * oversampling + 1), 128);
		const auto blocks = getBlockSize(size * size * (2 * oversampling), 128);

		// TODO: check if there is a performance gain in using a block size that is a multiple of
		// convolution_size.
		if (out_sqr) {
		accumulateOnDeviceKernel<oversampling, window_sampling, ScalarIn, ScalarOut, true>
		<<<blocks[0], blocks[1], 0, stream_>>>(M, ldm, sign, out, out_sqr, ldo, config_left,