Commit 41c393bc authored by gbalduzz's avatar gbalduzz
Browse files

Skip furthest convolution point

parent b44c006e
Loading
Loading
Loading
Loading
+8 −8
Original line number Diff line number Diff line
@@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact(
  const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element();
  const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta();

  int lambda_0 = (t_val - T_0) * one_div_Delta;
  const int lambda_0 = (t_val - T_0) * one_div_Delta;

  for (int l = -oversampling; l <= oversampling; ++l)
    f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val);
  for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l)
    f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val);
}

template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
@@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
  ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
  ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j);

  nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
  NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
}

template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
@@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
  ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
  ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j);

  nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
  NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
}

template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
@@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() {
  return phi_wn;
}

}  // nfft
}  // math
}  // dca
}  // namespace nfft
}  // namespace math
}  // namespace dca

#endif  // DCA_MATH_NFFT_DNFFT_1D_HPP
+25 −10
Original line number Diff line number Diff line
@@ -18,12 +18,12 @@ namespace nfft {
// dca::math::nfft::

template <int max_count, int count>
struct nfft_atomic_convolution {
struct NfftAtomicConvolutionImpl {
  template <typename ScalarType>
  inline static void execute_linear(ScalarType* f, const ScalarType* M, const ScalarType* y) {
    f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]);

    nfft_atomic_convolution<max_count, count + 1>::execute_linear(f, M, y);
    NfftAtomicConvolutionImpl<max_count, count + 1>::execute_linear(f, M, y);
  }

  template <typename ScalarType>
@@ -31,7 +31,7 @@ struct nfft_atomic_convolution {
    f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] +
                 M[3 + 4 * count] * y[3]);

    nfft_atomic_convolution<max_count, count + 1>::execute_cubic(f, M, y);
    NfftAtomicConvolutionImpl<max_count, count + 1>::execute_cubic(f, M, y);
  }

  template <typename ScalarType>
@@ -54,14 +54,14 @@ struct nfft_atomic_convolution {
  inline static void execute_Mt_y_1(ScalarType* f, const ScalarType* M, const ScalarType* y) {
    f[count] += M[count] * y[0];

    nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_1(f, M, y);
    NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_1(f, M, y);
  }

  template <typename ScalarType>
  inline static void execute_Mt_y_2(ScalarType* f, const ScalarType* M, const ScalarType* y) {
    f[count] += (M[0 + 2 * count] * y[0] + M[1 + 2 * count] * y[1]);

    nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_2(f, M, y);
    NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_2(f, M, y);
  }

  template <typename ScalarType>
@@ -69,12 +69,12 @@ struct nfft_atomic_convolution {
    f[count] += (M[0 + 4 * count] * y[0] + M[1 + 4 * count] * y[1] + M[2 + 4 * count] * y[2] +
                 M[3 + 4 * count] * y[3]);

    nfft_atomic_convolution<max_count, count + 1>::execute_Mt_y_4(f, M, y);
    NfftAtomicConvolutionImpl<max_count, count + 1>::execute_Mt_y_4(f, M, y);
  }
};

template <int max_count>
struct nfft_atomic_convolution<max_count, max_count> {
struct NfftAtomicConvolutionImpl<max_count, max_count> {
  template <typename ScalarType>
  inline static void execute_linear(ScalarType* /*f*/, const ScalarType* /*y*/,
                                    const ScalarType* /*M*/) {}
@@ -92,8 +92,23 @@ struct nfft_atomic_convolution<max_count, max_count> {
                                    const ScalarType* /*y*/) {}
};

}  // nfft
}  // math
}  // dca
// Entry point for the compile-time-unrolled NFFT convolution used by Dnfft1D.
// Forwards to NfftAtomicConvolutionImpl with the recursion started at index 1
// (instead of 0), so the first — furthest — of the 2 * oversampling + 1 window
// points is skipped and only 2 * oversampling points are accumulated.
//
// Parameter names follow the Impl signatures and the call sites, which pass
// (f_tau_ptr, matrix_ptr, y_ptr):
//   f: output accumulator, M: convolution coefficient matrix column, y: input.
template <int oversampling>
struct NfftAtomicConvolution {
  // Linear interpolation: the Impl consumes 2 matrix entries per output point.
  template <typename ScalarType>
  inline static void execute_linear(ScalarType* f, const ScalarType* M, const ScalarType* y) {
    static_assert(oversampling > 1, "Invalid oversampling size.");
    NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_linear(f, M, y);
  }

  // Cubic interpolation: the Impl consumes 4 matrix entries per output point.
  template <typename ScalarType>
  inline static void execute_cubic(ScalarType* f, const ScalarType* M, const ScalarType* y) {
    static_assert(oversampling > 1, "Invalid oversampling size.");
    NfftAtomicConvolutionImpl<2 * oversampling + 1, 1>::execute_cubic(f, M, y);
  }
};

}  // namespace nfft
}  // namespace math
}  // namespace dca

#endif  // DCA_MATH_NFFT_NFFT_ATOMIC_CONVOLUTION_HPP
+5 −7
Original line number Diff line number Diff line
@@ -47,10 +47,10 @@ template <int oversampling, int window_sampling, typename ScalarIn, typename Sca
          bool accumulate_m_sqr = false>
__global__ void accumulateOnDeviceKernel(
    const ScalarIn* __restrict__ M, const int ldm, const ScalarIn sign, ScalarOut* __restrict__ out,
    ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* config_left,
    const ConfigElem* config_right, const ScalarIn* __restrict__ times,
    ScalarOut* __restrict__ out_sqr, int ldo, const ConfigElem* __restrict__ config_left,
    const ConfigElem* __restrict__ config_right, const ScalarIn* __restrict__ times,
    const ScalarOut* __restrict__ cubic_coeff, const int m_size) {
  constexpr int conv_size = 2 * oversampling + 1;
  constexpr int conv_size = 2 * oversampling;
  int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (thread_idx >= m_size * m_size * conv_size)
    return;
@@ -59,7 +59,7 @@ __global__ void accumulateOnDeviceKernel(
  const int m_j = thread_idx / (m_size * conv_size);
  thread_idx -= m_j * (m_size * conv_size);
  const int m_i = thread_idx / conv_size;
  const int conv_idx = thread_idx - m_i * conv_size;
  const int conv_idx = thread_idx - m_i * conv_size + 1;

  const ScalarOut tau = nfft_helper.computeTau(times[m_i], times[m_j]);

@@ -89,10 +89,8 @@ void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, S
                        ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left,
                        const ConfigElem* config_right, const ScalarIn* tau,
                        const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_) {
  const auto blocks = getBlockSize(size * size * (2 * oversampling + 1), 128);
  const auto blocks = getBlockSize(size * size * (2 * oversampling), 128);

  // TODO: check if there is a performance gain in using a block size that is a multiple of
  //       convolution_size.
  if (out_sqr) {
    accumulateOnDeviceKernel<oversampling, window_sampling, ScalarIn, ScalarOut, true>
        <<<blocks[0], blocks[1], 0, stream_>>>(M, ldm, sign, out, out_sqr, ldo, config_left,