Merge pull request #165 from gbalduzz/optimize_sp_accum (1f9af577) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

CMakeLists.txt

+0 −1

Original line number	Diff line number	Diff line
		@@ -151,7 +151,6 @@ set(DCA_LIBS
		if (DCA_HAVE_CUDA)
		list(APPEND DCA_LIBS
		blas_kernels
		dnfft_kernels
		lapack_kernels
		mc_kernels
		special_transform_kernels

cmake/dca_cuda.cmake

+2 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,8 @@ if (CUDA_FOUND)
		list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY})
		CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
		set(CUDA_SEPARABLE_COMPILATION ON)
		list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
		set(CMAKE_CUDA_STANDARD 14)

		set(CVD_LAUNCHER "" CACHE INTERNAL "launch script for setting the Cuda visible devices.")
		# Use the following script for systems with multiple gpus visible from a rank.

include/dca/math/nfft/dnfft_1d.hpp

+8 −8

Original line number	Diff line number	Diff line
		@@ -275,10 +275,10 @@ void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTauExact(
		const ScalarType T_0 = PaddedTimeDmn::parameter_type::first_element();
		const ScalarType one_div_Delta = PaddedTimeDmn::parameter_type::get_one_div_Delta();

		- int lambda_0 = (t_val - T_0) * one_div_Delta;
		+ const int lambda_0 = (t_val - T_0) * one_div_Delta;

		- for (int l = -oversampling; l <= oversampling; ++l)
		- f_tau_(lambda_0 + l, index) += f_val * WindowFunction::phi_t(tau(lambda_0 + l) - t_val);
		+ for (int l = lambda_0 - oversampling + 1; l <= lambda_0 + oversampling; ++l)
		+ f_tau_(l, index) += f_val * WindowFunction::phi_t(tau(l) - t_val);
		}

		template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
		@@ -316,7 +316,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
		ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
		ScalarType* matrix_ptr = &get_linear_convolution_matrices()(0, i, j);

		- nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
		+ NfftAtomicConvolution<oversampling>::execute_linear(f_tau_ptr, matrix_ptr, y_ptr);
		}

		template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
		@@ -364,7 +364,7 @@ inline void Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::convoluteToFTau
		ScalarType* f_tau_ptr = &f_tau_(tau_index, index);
		ScalarType* matrix_ptr = &get_cubic_convolution_matrices()(0, i, j);

		- nfft_atomic_convolution<2 * oversampling + 1, 0>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
		+ NfftAtomicConvolution<oversampling>::execute_cubic(f_tau_ptr, matrix_ptr, y_ptr);
		}

		template <typename ScalarType, typename WDmn, typename PDmn, int oversampling, NfftModeNames mode>
		@@ -481,8 +481,8 @@ auto& Dnfft1D<ScalarType, WDmn, PDmn, oversampling, mode>::get_phi_wn() {
		return phi_wn;
		}

		- } // nfft
		- } // math
		- } // dca
		+ } // namespace nfft
		+ } // namespace math
		+ } // namespace dca

		#endif // DCA_MATH_NFFT_DNFFT_1D_HPP

include/dca/math/nfft/dnfft_1d_gpu.hpp

+13 −13

Original line number	Diff line number	Diff line
		@@ -145,13 +145,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo
		cudaMemcpyHostToDevice);

		const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix();
		+ const auto& add_matrix = RDmn::parameter_type::get_add_matrix();
		using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type;
		using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type;
		- details::initializeNfftHelper<ScalarType>(
		- BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(),
		- oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(),
		- PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(),
		- beta_);
		+ details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(),
		+ add_matrix.leadingDimension(), sub_matrix.ptr(),
		+ sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(),
		+ PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(),
		+ WindowTimeDmn::get_delta(), beta_);

		assert(cudaPeekAtLastError() == cudaSuccess);
		});
		@@ -190,11 +191,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate(
		config_left_dev_.setAsync(config_left_, stream_);
		times_dev_.setAsync(times_, stream_);

		- details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign),
		- accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(),
		- accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
		- config_right_dev_.ptr(), times_dev_.ptr(),
		- get_device_cubic_coeff().ptr(), n, stream_);
		+ details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>(
		+ M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(),
		+ accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
		+ config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_);

		m_copied_event_.record(stream_);
		}
		@@ -245,8 +245,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over
		return coefficients;
		}

		- } // nfft
		- } // math
		- } // dca
		+ } // namespace nfft
		+ } // namespace math
		+ } // namespace dca

		#endif // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP

include/dca/math/nfft/kernels_interface.hpp

+13 −14

Original line number	Diff line number	Diff line
		@@ -29,23 +29,22 @@ struct ConfigElem {
		int site;
		};

		- template <typename ScalarType>
		- void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out,
		- ScalarType* out_sqr, const int ldo, const ConfigElem* config_left,
		- const ConfigElem* config_right, const ScalarType* tau,
		- const ScalarType* cubic_coeff, int size, cudaStream_t stream_);
		+ template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut>
		+ void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out,
		+ ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left,
		+ const ConfigElem* config_right, const ScalarIn* tau,
		+ const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_);

		template <typename ScalarType>
		void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream);

		- template <typename ScalarType>
		- void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling,
		- int window_sampling, ScalarType t0, ScalarType delta_t,
		- ScalarType t0_window, ScalarType delta_t_window, ScalarType beta);
		-
		- } // details
		- } // nfft
		- } // math
		- } // dca
		+ void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds,
		+ double t0, double delta_t, double t0_window, double delta_t_window,
		+ double beta);
		+
		+ } // namespace details
		+ } // namespace nfft
		+ } // namespace math
		+ } // namespace dca

		#endif // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP