Loading CMakeLists.txt +0 −1 Original line number Diff line number Diff line Loading @@ -151,7 +151,6 @@ set(DCA_LIBS if (DCA_HAVE_CUDA) list(APPEND DCA_LIBS blas_kernels dnfft_kernels lapack_kernels mc_kernels special_transform_kernels Loading cmake/dca_cuda.cmake +2 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,8 @@ if (CUDA_FOUND) list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY}) CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") set(CMAKE_CUDA_STANDARD 14) set(CVD_LAUNCHER "" CACHE INTERNAL "launch script for setting the Cuda visible devices.") # Use the following script for systems with multiple gpus visible from a rank. Loading include/dca/math/nfft/dnfft_1d.hpp +8 −8 Original line number Diff line number Diff line Loading @@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact( const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element(); const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta(); int lambda_0 = (t_val - T_0) * one_div_Delta; const int lambda_0 = (t_val - T_0) * one_div_Delta; for (int l = -oversampling; l <= oversampling; ++l) f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val); for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l) f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() { return phi_wn; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_HPP include/dca/math/nfft/dnfft_1d_gpu.hpp +13 −13 Original line number Diff line number Diff line Loading @@ -145,13 +145,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo cudaMemcpyHostToDevice); const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix(); const auto& add_matrix = RDmn::parameter_type::get_add_matrix(); using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type; using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type; details::initializeNfftHelper<ScalarType>( BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(), oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(), add_matrix.leadingDimension(), sub_matrix.ptr(), sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); assert(cudaPeekAtLastError() == cudaSuccess); }); Loading Loading @@ -190,11 +191,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate( config_left_dev_.setAsync(config_left_, stream_); times_dev_.setAsync(times_, stream_); details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>( M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); m_copied_event_.record(stream_); } Loading Loading @@ -245,8 +245,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over return coefficients; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP include/dca/math/nfft/kernels_interface.hpp +13 −14 Original line number Diff line number Diff line Loading @@ -29,23 +29,22 @@ struct ConfigElem { int site; }; template <typename ScalarType> void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out, ScalarType* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarType* tau, const ScalarType* cubic_coeff, int size, cudaStream_t stream_); template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut> void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out, ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* tau, const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_); template <typename ScalarType> void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream); template <typename ScalarType> void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); } // details } // nfft } // math } // dca void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds, double t0, double delta_t, double t0_window, double delta_t_window, double beta); } // namespace details } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP Loading
CMakeLists.txt +0 −1 Original line number Diff line number Diff line Loading @@ -151,7 +151,6 @@ set(DCA_LIBS if (DCA_HAVE_CUDA) list(APPEND DCA_LIBS blas_kernels dnfft_kernels lapack_kernels mc_kernels special_transform_kernels Loading
cmake/dca_cuda.cmake +2 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,8 @@ if (CUDA_FOUND) list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY}) CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") set(CMAKE_CUDA_STANDARD 14) set(CVD_LAUNCHER "" CACHE INTERNAL "launch script for setting the Cuda visible devices.") # Use the following script for systems with multiple gpus visible from a rank. Loading
include/dca/math/nfft/dnfft_1d.hpp +8 −8 Original line number Diff line number Diff line Loading @@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact( const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element(); const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta(); int lambda_0 = (t_val - T_0) * one_div_Delta; const int lambda_0 = (t_val - T_0) * one_div_Delta; for (int l = -oversampling; l <= oversampling; ++l) f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val); for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l) f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau ScalarType* f_tau_ptr = &f_tau_(tau_index, index); ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j); nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr); } template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode> Loading Loading @@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() { return phi_wn; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_HPP
include/dca/math/nfft/dnfft_1d_gpu.hpp +13 −13 Original line number Diff line number Diff line Loading @@ -145,13 +145,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo cudaMemcpyHostToDevice); const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix(); const auto& add_matrix = RDmn::parameter_type::get_add_matrix(); using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type; using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type; details::initializeNfftHelper<ScalarType>( BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(), oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(), add_matrix.leadingDimension(), sub_matrix.ptr(), sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(), PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(), beta_); assert(cudaPeekAtLastError() == cudaSuccess); }); Loading Loading @@ -190,11 +191,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate( config_left_dev_.setAsync(config_left_, stream_); times_dev_.setAsync(times_, stream_); details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>( M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(), config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_); m_copied_event_.record(stream_); } Loading Loading @@ -245,8 +245,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over return coefficients; } } // nfft } // math } // dca } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP
include/dca/math/nfft/kernels_interface.hpp +13 −14 Original line number Diff line number Diff line Loading @@ -29,23 +29,22 @@ struct ConfigElem { int site; }; template <typename ScalarType> void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out, ScalarType* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarType* tau, const ScalarType* cubic_coeff, int size, cudaStream_t stream_); template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut> void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out, ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left, const ConfigElem* config_right, const ScalarIn* tau, const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_); template <typename ScalarType> void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream); template <typename ScalarType> void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling, int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window, ScalarType delta_t_window, ScalarType beta); } // details } // nfft } // math } // dca void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds, double t0, double delta_t, double t0_window, double delta_t_window, double beta); } // namespace details } // namespace nfft } // namespace math } // namespace dca #endif // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP