Commit cc5f8e44 authored by gbalduzz's avatar gbalduzz
Browse files

Use __constant__ nfft helper.

parent 5ad53f5b
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -151,7 +151,6 @@ if (DCA_HAVE_CUDA)
    cuda_utils)
  list(APPEND DCA_LIBS
    blas_kernels
    dnfft_kernels
    lapack_kernels
    mc_kernels
    special_transform_kernels
+6 −1
Original line number Diff line number Diff line
@@ -18,8 +18,13 @@ if (CUDA_FOUND)
  # set(DCA_HAVE_CUDA TRUE CACHE INTERNAL "")
  # dca_add_haves_define(DCA_HAVE_CUDA)
  list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY})
  CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
  set(CUDA_SEPARABLE_COMPILATION ON)
  list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
#  if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
#    list(APPEND CUDA_NVCC_FLAGS "-G")
#  endif()
  # list(APPEND CUDA_NVCC_FLAGS_RELEASE "--use_fast_math")
  set(CMAKE_CUDA_STANDARD 14)
endif()

# Find MAGMA.
+13 −14
Original line number Diff line number Diff line
@@ -139,7 +139,6 @@ template <typename ScalarType, typename WDmn, typename RDmn, int oversampling>
void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCoefficients() {
  static std::once_flag flag;
  std::call_once(flag, [&]() {

    const auto& host_coeff = BaseClass::get_cubic_convolution_matrices();
    auto& dev_coeff = get_device_cubic_coeff();
    dev_coeff.resizeNoCopy(host_coeff.size());
@@ -147,13 +146,14 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::initializeDeviceCo
               cudaMemcpyHostToDevice);

    const auto& sub_matrix = RDmn::parameter_type::get_subtract_matrix();
    const auto& add_matrix = RDmn::parameter_type::get_add_matrix();
    using PaddedTimeDmn = typename BaseClass::PaddedTimeDmn::parameter_type;
    using WindowTimeDmn = typename BaseClass::WindowFunctionTimeDmn::parameter_type;
    details::initializeNfftHelper<ScalarType>(
        BDmn::dmn_size(), RDmn::dmn_size(), sub_matrix.ptr(), sub_matrix.leadingDimension(),
        oversampling, BaseClass::get_window_sampling(), PaddedTimeDmn::first_element(),
        PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(), WindowTimeDmn::get_delta(),
        beta_);
    details::initializeNfftHelper(BDmn::dmn_size(), RDmn::dmn_size(), add_matrix.ptr(),
                                  add_matrix.leadingDimension(), sub_matrix.ptr(),
                                  sub_matrix.leadingDimension(), PaddedTimeDmn::first_element(),
                                  PaddedTimeDmn::get_Delta(), WindowTimeDmn::first_element(),
                                  WindowTimeDmn::get_delta(), beta_);

    assert(cudaPeekAtLastError() == cudaSuccess);
  });
@@ -192,11 +192,10 @@ void Dnfft1DGpu<ScalarType, WDmn, RDmn, oversampling, CUBIC>::accumulate(
  config_left_dev_.setAsync(config_left_, stream_);
  times_dev_.setAsync(times_, stream_);

  details::accumulateOnDevice(M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign),
                              accumulation_matrix_.ptr(), accumulation_matrix_sqr_.ptr(),
                              accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
                              config_right_dev_.ptr(), times_dev_.ptr(),
                              get_device_cubic_coeff().ptr(), n, stream_);
  details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, RealInp, ScalarType>(
      M.ptr(), M.leadingDimension(), static_cast<ScalarType>(sign), accumulation_matrix_.ptr(),
      accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
      config_right_dev_.ptr(), times_dev_.ptr(), get_device_cubic_coeff().ptr(), n, stream_);

  m_copied_event_.record(stream_);
}
@@ -247,8 +246,8 @@ linalg::Vector<ScalarType, linalg::GPU>& Dnfft1DGpu<ScalarType, WDmn, RDmn, over
  return coefficients;
}

}  // nfft
}  // math
}  // dca
}  // namespace nfft
}  // namespace math
}  // namespace dca

#endif  // DCA_MATH_NFFT_DNFFT_1D_GPU_HPP
+13 −14
Original line number Diff line number Diff line
@@ -29,23 +29,22 @@ struct ConfigElem {
  int site;
};

template <typename ScalarType>
void accumulateOnDevice(const ScalarType* M, int ldm, ScalarType sign, ScalarType* out,
                        ScalarType* out_sqr, const int ldo, const ConfigElem* config_left,
                        const ConfigElem* config_right, const ScalarType* tau,
                        const ScalarType* cubic_coeff, int size, cudaStream_t stream_);
template <int oversampling, int window_sampling, typename ScalarIn, typename ScalarOut>
void accumulateOnDevice(const ScalarIn* M, const int ldm, const ScalarIn sign, ScalarOut* out,
                        ScalarOut* out_sqr, const int ldo, const ConfigElem* config_left,
                        const ConfigElem* config_right, const ScalarIn* tau,
                        const ScalarOut* cubic_coeff, const int size, cudaStream_t stream_);

template <typename ScalarType>
void sum(const ScalarType* in, int ldi, ScalarType* out, int ldo, int n, int m, cudaStream_t stream);

template <typename ScalarType>
void initializeNfftHelper(int nb, int nr, const int* sub_r, int lds, int oversampling,
                          int window_sampling, ScalarType t0, ScalarType delta_t,
                          ScalarType t0_window, ScalarType delta_t_window, ScalarType beta);

}  // details
}  // nfft
}  // math
}  // dca
void initializeNfftHelper(int nb, int nc, const int* add_r, int lda, const int* sub_r, int lds,
                          double t0, double delta_t, double t0_window, double delta_t_window,
                          double beta);

}  // namespace details
}  // namespace nfft
}  // namespace math
}  // namespace dca

#endif  // DCA_MATH_NFFT_KERNELS_INTERFACE_HPP
+0 −191
Original line number Diff line number Diff line
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE.txt for terms of usage.
// See CITATION.txt for citation guidelines if you use this code for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// Helper class for computing interpolation indices during the single particle accumulation.

#ifndef DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER
#define DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER

#include <array>
#include <cassert>
#include <cuda.h>
#include <stdexcept>

#include "dca/math/nfft/nfft_mode_names.hpp"

namespace dca {
namespace math {
namespace nfft {
namespace details {
// dca::math::nfft::details::

// Device-side helper holding precomputed NFFT parameters in GPU global memory.
// Instances are shallow-copied (default copy constructor) so they can be passed
// to kernels by value; ownership of the device allocations stays with the
// derived NfftHelperManager, which frees them in its destructor.
template <typename ScalarType>
class NfftHelper {
public:
  // Allocates device storage and uploads the site subtraction matrix and the
  // integer/real parameter packs. Throws std::logic_error if called twice.
  __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling,
                    int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window,
                    ScalarType delta_t_window, ScalarType beta);

  // Shallow copy: duplicates the raw device pointers without transferring
  // ownership.
  NfftHelper(const NfftHelper& other) = default;

  // True once set() has uploaded the parameter arrays.
  __host__ bool isInitialized() const {
    return parameters_int_ != nullptr;
  }

  // Flattened index b1 + nb * (b2 + nb * deltaR(r2, r1)).
  __device__ int inline computeLinearIndex(int b1, int b2, int r1, int r2) const;

  // Scaled time difference (t1 - t2) * tau_scale, with tau_scale = 0.5 / beta.
  __device__ ScalarType inline computeTau(ScalarType t1, ScalarType t2) const;

  // Entry (r1, r2) of the device copy of the site subtraction matrix.
  __device__ inline int deltaR(int r1, int r2) const;

  // Computes the padded-time index, the convolution-coefficient offset and the
  // residual time shift used for the window-function interpolation.
  // mode must be LINEAR or CUBIC (enforced by a static_assert).
  template <NfftModeNames mode>
  __device__ inline void computeInterpolationIndices(ScalarType t, int& t_idx, int& conv_coeff_idx,
                                                     ScalarType& delta_t) const;

  __device__ inline int get_oversampling() const {
    return parameters_int_[3];
  }

protected:
  // This object can be constructed only through its derived class.
  NfftHelper() = default;

  // Device copy of the site subtraction matrix (leading dimension lds).
  int* sub_matrix_ = nullptr;

  // Stores in this order { nb, nr, lds, oversampling, window_sampling}
  int* parameters_int_ = nullptr;
  // Stores in this order { t0, delta_t, t0_window, delta_t_window, one_div_delta,
  // one_div_delta_window, tau_scale}
  ScalarType* parameters_real_ = nullptr;
};

// Host-side owner of the NfftHelper device allocations. Copying is deleted:
// the destructor frees the device pointers, so a shallow copy of the manager
// would lead to a double free.
template <typename ScalarType>
class NfftHelperManager : public NfftHelper<ScalarType> {
public:
  NfftHelperManager() = default;

  // Frees the device-side arrays allocated by set().
  __host__ ~NfftHelperManager();

  __host__ __device__ NfftHelperManager(const NfftHelperManager& other) = delete;

  // Forwards to NfftHelper::set and caches the oversampling on the host.
  __host__ void set(int nb, int nr, const int* sub_r, int lds, int oversampling,
                    int window_sampling, ScalarType t0, ScalarType delta_t, ScalarType t0_window,
                    ScalarType delta_t_window, ScalarType beta);

  // Host-side accessor; valid only after set() has been called.
  int get_oversampling() const {
    assert(NfftHelper<ScalarType>::isInitialized());
    return oversampling_;
  }

private:
  // Host cache of the oversampling (the device copy lives in parameters_int_[3]).
  int oversampling_ = -1;
};

// Uploads all time-independent interpolation parameters to GPU global memory.
// May be called only once per object.
//
// sub_r: host-side site subtraction matrix with leading dimension lds.
// Throws: std::logic_error on repeated initialization,
//         std::runtime_error if a CUDA allocation or copy fails.
template <typename ScalarType>
__host__ void NfftHelper<ScalarType>::set(const int nb, const int nr, const int* sub_r, int lds,
                                          int oversampling, int window_sampling, ScalarType t0,
                                          ScalarType delta_t, const ScalarType t0_window,
                                          const ScalarType delta_t_window, const ScalarType beta) {
  if (isInitialized())
    throw(std::logic_error("already initialized."));

  // Check every CUDA call: a silent failure here would leave dangling device
  // pointers that only surface later as corrupt results inside the kernels.
  auto check = [](cudaError_t err) {
    if (err != cudaSuccess)
      throw(std::runtime_error(cudaGetErrorString(err)));
  };

  check(cudaMalloc(&sub_matrix_, sizeof(int) * lds * nr));
  check(cudaMemcpy(sub_matrix_, sub_r, sizeof(int) * lds * nr, cudaMemcpyHostToDevice));

  const std::array<int, 5> parameters_int_host{nb, nr, lds, oversampling, window_sampling};
  check(cudaMalloc(&parameters_int_, sizeof(int) * parameters_int_host.size()));
  check(cudaMemcpy(parameters_int_, parameters_int_host.data(),
                   sizeof(int) * parameters_int_host.size(), cudaMemcpyHostToDevice));

  const ScalarType one(1);
  // tau_scale rescales a time difference by 0.5 / beta (see computeTau).
  const ScalarType tau_scale = ScalarType(0.5) / beta;
  const std::array<ScalarType, 7> parameters_real_host{
      t0, delta_t, t0_window, delta_t_window, one / delta_t, one / delta_t_window, tau_scale};
  check(cudaMalloc(&parameters_real_, sizeof(ScalarType) * parameters_real_host.size()));
  check(cudaMemcpy(parameters_real_, parameters_real_host.data(),
                   sizeof(ScalarType) * parameters_real_host.size(), cudaMemcpyHostToDevice));
}

// Returns the entry (r1, r2) of the device-side site subtraction matrix.
template <typename ScalarType>
__device__ int NfftHelper<ScalarType>::deltaR(const int r1, const int r2) const {
  const int leading_dim = parameters_int_[2];  // lds uploaded by set().
  return sub_matrix_[r2 * leading_dim + r1];
}

// Initializes the base helper on the device, then caches the oversampling on
// the host side for fast access without a device read.
template <typename ScalarType>
__host__ void NfftHelperManager<ScalarType>::set(const int nb, const int nr, const int* sub_r,
                                                 int lds, int oversampling, int window_sampling,
                                                 ScalarType t0, ScalarType delta_t,
                                                 const ScalarType t0_window,
                                                 const ScalarType delta_t_window,
                                                 const ScalarType beta) {
  using Base = NfftHelper<ScalarType>;
  // Upload first: if the base set() throws, oversampling_ keeps its sentinel
  // value and isInitialized() stays false.
  Base::set(nb, nr, sub_r, lds, oversampling, window_sampling, t0, delta_t, t0_window,
            delta_t_window, beta);
  oversampling_ = oversampling;
}

// Flattens (b1, b2, site difference) into a single index with stride nb.
template <typename ScalarType>
__device__ int inline NfftHelper<ScalarType>::computeLinearIndex(const int b1, const int b2,
                                                                 const int r1, const int r2) const {
  const int n_bands = parameters_int_[0];
  // Note: the site difference is looked up with the arguments swapped, (r2, r1).
  const int dr = deltaR(r2, r1);
  return b1 + n_bands * b2 + n_bands * n_bands * dr;
}

// Rescales the time difference t1 - t2 by tau_scale = 0.5 / beta
// (parameters_real_[6], uploaded by set()).
template <typename ScalarType>
__device__ ScalarType NfftHelper<ScalarType>::computeTau(ScalarType t1, ScalarType t2) const {
  return (t1 - t2) * parameters_real_[6];
}

// Given a time t, computes:
//  - t_idx: index on the padded time grid, shifted back by the oversampling,
//  - conv_coeff_idx: offset into the convolution-coefficient storage,
//  - delta_t: residual distance between the grid point and the nearest
//    window-function sample.
template <typename ScalarType>
template <NfftModeNames mode>
__device__ void NfftHelper<ScalarType>::computeInterpolationIndices(ScalarType t, int& t_idx,
                                                                    int& conv_coeff_idx,
                                                                    ScalarType& delta_t) const {
  static_assert(mode == CUBIC || mode == LINEAR,
                "This method is defined only for linear or cubic interpolations.");

  // Unpack the real-valued parameters uploaded by set().
  const ScalarType t0 = parameters_real_[0];
  const ScalarType delta_t_padded = parameters_real_[1];
  const ScalarType t0_window = parameters_real_[2];
  const ScalarType delta_t_window = parameters_real_[3];
  auto get_tau = [=](int index) { return t0 + index * delta_t_padded; };
  auto get_window_tau = [&](int index) { return t0_window + index * delta_t_window; };
  const ScalarType one_div_delta = parameters_real_[4];
  const ScalarType one_div_delta_wnd = parameters_real_[5];

  // Position of t on the padded time grid (float-to-int truncation).
  t_idx = (t - t0) * one_div_delta;
  // Position of the residual (grid point minus t) on the finer window grid.
  const int t_wnd_idx = (get_tau(t_idx) - t - t0_window) * one_div_delta_wnd;
  delta_t = get_tau(t_idx) - t - get_window_tau(t_wnd_idx);

  const int oversampling = parameters_int_[3];
  const int window_sampling = parameters_int_[4];
  // Shift both indices back by the oversampling (window_sampling samples per
  // padded-grid step on the window grid).
  t_idx -= oversampling;
  const int delta_tau_index = t_wnd_idx - oversampling * window_sampling;
  // Split into the window-sample component (j) and the coarse component (i).
  const int j = delta_tau_index % window_sampling;
  const int i = (delta_tau_index - j) / window_sampling;

  // Cubic interpolation uses 4 coefficients per point, linear uses 2.
  constexpr int interpolation_size = mode == CUBIC ? 4 : 2;
  conv_coeff_idx = interpolation_size * i + interpolation_size * 4 * oversampling * j;
}

// Releases the device-side storage owned through the NfftHelper base.
template <typename ScalarType>
__host__ NfftHelperManager<ScalarType>::~NfftHelperManager() {
  using Base = NfftHelper<ScalarType>;
  cudaFree(Base::parameters_real_);
  cudaFree(Base::parameters_int_);
  cudaFree(Base::sub_matrix_);
}

}  // details
}  // nfft
}  // math
}  // dca

#endif  // DCA_INCLUDE_DCA_MATH_NFFT_NFFT_HELPER
Loading