Merge pull request #83 from CompFUSE/minor_g4_fixes (aefe49e4) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp

+17 −15

Original line number	Diff line number	Diff line
		@@ -383,10 +383,12 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {

		switch (mode_) {
		case PARTICLE_HOLE_MAGNETIC:
		// Note: sums over spin indices are implied.
		//
		// G4(k1, k2, k_ex) = 1/2 (s1 * s2) <c^+(k1 + k_ex, s1) c(k1, s1)
		// c^+(k2, s2) c(k2 + k_ex, s2)>
		// = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - (s1 ==s2)
		// G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>
		// = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) -
		// (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>.
		for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
		const int w_ex = exchange_frq[w_ex_idx];
		for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
		@@ -412,7 +414,7 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {
		case PARTICLE_HOLE_CHARGE:
		// G4(k1, k2, k_ex) += 1/2 <c^+(k1 + k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2 + k_ex, s2)> =
		// = 1/2 <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) -
		// (s1 ==s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>
		// (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>.
		for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
		const int w_ex = exchange_frq[w_ex_idx];
		for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
		@@ -437,8 +439,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {
		break;

		case PARTICLE_HOLE_TRANSVERSE:
		// G4 = 1/2 \sum_s <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)>
		// = -1/2 \sum_s G( k2 + k_ex, k1 + k_ex, s) G4(k1, k2, -s)
		// G4 = 1/2 <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)>
		// = -1/2 G(k2 + k_ex, k1 + k_ex, s) G(k1, k2, -s).
		for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
		const int w_ex = exchange_frq[w_ex_idx];
		for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
		@@ -460,8 +462,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {
		break;

		case PARTICLE_PARTICLE_UP_DOWN:
		// G4 = 1/2 \sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)>
		// = 1/2 \sum_s G(k_ex-k2, k_ex-k1, s) G4(k2, k1, -s)
		// G4 = 1/2 <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)>
		// = 1/2 G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s).
		for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
		const int w_ex = exchange_frq[w_ex_idx];
		for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
		@@ -531,9 +533,9 @@ void TpAccumulator<Parameters, linalg::CPU>::updateG4SpinDifference(
		const bool cross_legs) {
		// This function performs the following update for each band:
		//
		// G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a) * G(up, k1_b, k2_b, w1_b, w2_b)
		// + sign * G(down, k1_a, k2_a, w1_a, w2_a) * G(down, k1_b, k2_b, w1_b,
		// w2_b)
		// G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a)
		// + sign * G(down,k1_a, k2_a, w1_a, w2_a)) *
		// (G(up,k1_b,k2_b,w1_b,w2_b) + sign * G(down,k1_b,k2_b,w1_b,w2_b))
		if (n_bands_ == 1) {
		G4_ptr += alpha (getGSingleband(0, k1_a, k2_a, w1_a, w2_a) +
		Complex(sign) * getGSingleband(1, k1_a, k2_a, w1_a, w2_a)) *

src/math/nfft/dnfft_1d_kernels.cu

+1 −1

Original line number	Diff line number	Diff line
		@@ -99,7 +99,7 @@ void accumulateOnDevice(const double* M, const int ldm, const ScalarType sign, S
		const ConfigElem* config_right, const ScalarType* tau,
		const ScalarType* cubic_coeff, const int size, cudaStream_t stream_) {
		const auto& helper = HelperSelector<ScalarType>::value;
		const static int convolution_size = 2 * helper.get_oversampling() + 1;
		const int convolution_size = 2 * helper.get_oversampling() + 1;
		const auto blocks = getBlockSize(size * size * convolution_size, 128);

		// TODO: check if there is a performance gain in using a block size that is a multiple of

src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu

+17 −10

Original line number	Diff line number	Diff line
		@@ -164,9 +164,9 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>*
		return candidate;
		return -1;
		};
		const static int width = get_block_width();

		const static auto blocks = getBlockSize(n_rows, n_rows * 2, width);
		const int width = get_block_width();
		const auto blocks = getBlockSize(n_rows, n_rows * 2, width);

		computeGMultibandKernel<<<blocks[0], blocks[1], width * width * sizeof(std::complex<Real>), stream>>>(
		castCudaComplex(G), ldg, castCudaComplex(G0), ldg0, nb, nk, nw_pos, beta);
		@@ -266,7 +266,11 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
		const bool conj_b = helper.extendGIndices(k1_b, k2_b,w1_b, w2_b);
		const int i_b = b2 + nb * k1_b + no * w1_b;
		const int j_b = b3 + nb * k2_b + no * w2_b;


		const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b);


		const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b);

		contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2);
		@@ -323,7 +327,10 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
		const int i_b = b2 + nb * k1_b + no * w1_b;
		const int j_b = b3 + nb * k2_b + no * w2_b;


		const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b);


		const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b);

		contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2);
		@@ -400,10 +407,10 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int
		const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk,
		const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign,
		cudaStream_t stream) {
		const static int nw = 2 * nw_pos;
		const static int size_12 = nw * nk * nb * nb;
		const static int size_3 = nw_exchange * nk_exchange;
		const static auto blocks = getBlockSize3D(size_12, size_12, size_3);
		const int nw = 2 * nw_pos;
		const int size_12 = nw * nk * nb * nb;
		const int size_3 = nw_exchange * nk_exchange;
		const auto blocks = getBlockSize3D(size_12, size_12, size_3);

		updateG4Kernel<Real, type><<<blocks[0], blocks[1], 0, stream>>>(
		castCudaComplex(G4), castCudaComplex(G_up), ldgu, castCudaComplex(G_down), ldgd, nb, nk, nw,