Unverified commit 1f9af577, authored by Peter Doak, committed by GitHub
Browse files

Merge pull request #165 from gbalduzz/optimize_sp_accum

Optimize sp accumulator
parents 7b6f95c4 5df418cd
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -151,7 +151,6 @@ set(DCA_LIBS
 if (DCA_HAVE_CUDA)
   list(APPEND DCA_LIBS
     blas_kernels
-    dnfft_kernels
     lapack_kernels
     mc_kernels
     special_transform_kernels
+2 −0
Original line number Diff line number Diff line
@@ -20,6 +20,8 @@ if (CUDA_FOUND)
   list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY})
   CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
   set(CUDA_SEPARABLE_COMPILATION ON)
+  list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+  set(CMAKE_CUDA_STANDARD 14)

   set(CVD_LAUNCHER "" CACHE INTERNAL "launch script for setting the Cuda visible devices.")
   # Use the following script for systems with multiple gpus visible from a rank.
+8 −8
Original line number Diff line number Diff line
@@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact(
   const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element();
   const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta();

-  int lambda_0 = (t_val - T_0) * one_div_Delta;
+  const int lambda_0 = (t_val - T_0) * one_div_Delta;

-  for (int l = -oversampling; l <= oversampling; ++l)
-    f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val);
+  for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l)
+    f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val);
 }

 template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
@@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
   ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
   ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j);

-  nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
+  NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
 }

 template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
@@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
   ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
   ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j);

-  nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
+  NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
 }

 template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
@@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() {
   return phi_wn;
 }

-}  // nfft
-}  // math
-}  // dca
+}  // namespace nfft
+}  // namespace math
+}  // namespace dca

 #endif  // DCA_MATH_NFFT_DNFFT_1D_HPP
+13 −13
Original line number Diff line number Diff line
@@ -145,13 +145,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo
                cudaMemcpyHostToDevice);

     const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix();
+    const auto& add_matrix = RDmn::parameter_type::get_add_matrix();
     using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type;
     using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type;
-    details::initializeNfftHelper<ScalarType>(
-        BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(),
-        oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(),
-        PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(),
-        beta_);
+    details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(),
+                                  add_matrix.leadingDimension(), sub_matrix.ptr(),
+                                  sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(),
+                                  PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(),
+                                  WindowTimeDmn::get_delta(), beta_);

     assert(cudaPeekAtLastError() == cudaSuccess);
   });
@@ -190,11 +191,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate(
   config_left_dev_.setAsync(config_left_, stream_);
   times_dev_.setAsync(times_, stream_);

-  details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign),
-                              accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(),
-                              accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
-                              config_right_dev_.ptr(), times_dev_.ptr(),
-                              get_device_cubic_coeff().ptr(), n, stream_);
+  details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>(
+      M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(),
+      accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
+      config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_);

   m_copied_event_.record(stream_);
 }
@@ -245,8 +245,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over
   return coefficients;
 }

-}  // nfft
-}  // math
-}  // dca
+}  // namespace nfft
+}  // namespace math
+}  // namespace dca

 #endif  // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP
+13 −14
Original line number Diff line number Diff line
@@ -29,23 +29,22 @@ struct ConfigElem {
   int site;
 };

-template <typename ScalarType>
-void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out,
-                        ScalarType* out_sqr, const int ldo, const ConfigElem* config_left,
-                        const ConfigElem* config_right, const ScalarType* tau,
-                        const ScalarType* cubic_coeff, int size, cudaStream_t stream_);
+template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut>
+void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out,
+                        ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left,
+                        const ConfigElem* config_right, const ScalarIn* tau,
+                        const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_);

 template <typename ScalarType>
 void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream);

-template <typename ScalarType>
-void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling,
-                          int window_sampling, ScalarType t0, ScalarType delta_t,
-                          ScalarType t0_window, ScalarType delta_t_window, ScalarType beta);
-
-}  // details
-}  // nfft
-}  // math
-}  // dca
+void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds,
+                          double t0, double delta_t, double t0_window, double delta_t_window,
+                          double beta);
+
+}  // namespace details
+}  // namespace nfft
+}  // namespace math
+}  // namespace dca

 #endif  // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP
Loading