fixing some merge goofs. 4_point_type ordering (44842fe2) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp

+22 −26

Original line number	Diff line number	Diff line
		@@ -182,7 +182,6 @@ TpAccumulator<Parameters, linalg::CPU>::TpAccumulator(
		: G0_ptr_(&G0),
		thread_id_(thread_id),
		multiple_accumulators_(pars.get_accumulators() > 1),
		mode_(pars.get_four_point_type()),
		beta_(pars.get_beta()),
		extension_index_offset_((WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2),
		n_pos_frqs_(WTpExtPosDmn::dmn_size()),
		@@ -434,6 +433,28 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) {
		const FourPointType channel = stringToFourPointType(channel_str);

		switch (channel) {
		case PARTICLE_HOLE_TRANSVERSE:
		// G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)>
		// = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s)
		for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
		const int w_ex = exchange_frq[w_ex_idx];
		for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
		for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1)
		for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) {
		const int k_ex = exchange_mom[k_ex_idx];
		for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2)
		for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) {
		Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx);
		for (int s = 0; s < 2; ++s)
		updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex),
		momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex),
		w_plus_w_ex(w1, w_ex), -sign_over_2, true);
		}
		}
		}
		flops += n_loops * 2 * flops_update_atomic;
		break;

		case PARTICLE_HOLE_MAGNETIC:
		// G4(k1, k2, k_ex) = 1/2 sum_{s1, s2} (s1 * s2)
		// <c^+(k1+k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2+k_ex, s2)>
		@@ -517,7 +538,6 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) {
		}
		}
		}

		flops += n_loops * 4 * flops_update_atomic;
		break;

		@@ -540,33 +560,9 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) {
		}
		}
		}

		flops += n_loops * 4 * flops_update_atomic;
		break;

		case PARTICLE_HOLE_TRANSVERSE:
		// G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)>
		// = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s)
		for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
		const int w_ex = exchange_frq[w_ex_idx];
		for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
		for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1)
		for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) {
		const int k_ex = exchange_mom[k_ex_idx];
		for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2)
		for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) {
		Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx);
		for (int s = 0; s < 2; ++s)
		updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex),
		momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex),
		w_plus_w_ex(w1, w_ex), -sign_over_2, true);
		}
		}
		}

		flops += n_loops * 2 * flops_update_atomic;
		break;

		case PARTICLE_PARTICLE_UP_DOWN:
		// G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)>
		// = 1/2 sum_s G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s)

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp

+8 −8

Original line number	Diff line number	Diff line
		@@ -395,6 +395,12 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_
		const FourPointType channel = stringToFourPointType(channel_str);

		switch (channel) {
		case PARTICLE_HOLE_TRANSVERSE:
		details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>(
		get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
		G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
		nk_exchange, sign_, multiple_accumulators_, streams_[0]);
		break;
		case PARTICLE_HOLE_MAGNETIC:
		details::updateG4<Real, PARTICLE_HOLE_MAGNETIC>(
		get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
		@@ -405,13 +411,13 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_
		details::updateG4<Real, PARTICLE_HOLE_CHARGE>(
		get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
		G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
		nk_exchange, sign_, streams_[0]);
		nk_exchange, sign_, multiple_accumulators_, streams_[0]);
		break;
		case PARTICLE_HOLE_LONGITUDINAL_UP_UP:
		details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_UP>(
		get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
		G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
		nk_exchange, sign_, streams_[0]);
		nk_exchange, sign_, multiple_accumulators_, streams_[0]);
		break;
		case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN:
		details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>(
		@@ -419,12 +425,6 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_
		G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
		nk_exchange, sign_, multiple_accumulators_, streams_[0]);
		break;
		case PARTICLE_HOLE_TRANSVERSE:
		details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>(
		get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
		G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
		nk_exchange, sign_, multiple_accumulators_, streams_[0]);
		break;
		case PARTICLE_PARTICLE_UP_DOWN:
		details::updateG4<Real, PARTICLE_PARTICLE_UP_DOWN>(
		get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),

src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu

+38 −18

Original line number	Diff line number	Diff line
		@@ -368,21 +368,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
		case PARTICLE_HOLE_LONGITUDINAL_UP_UP: {
		// contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s)
		int w1_a(w1);
		int w2_a(helper.addWex(w1, w_ex));
		int w2_a(g4_helper.addWex(w1, w_ex));
		int k1_a = k1;
		int k2_a = helper.addKex(k1, k_ex);
		const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
		int k2_a = g4_helper.addKex(k1, k_ex);
		const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
		const int i_a = b1 + nb * k1_a + no * w1_a;
		const int j_a = b3 + nb * k2_a + no * w2_a;

		const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a);
		const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a);

		int w1_b(helper.addWex(w2, w_ex));
		int w1_b(g4_helper.addWex(w2, w_ex));
		int w2_b(w2);
		int k1_b = helper.addKex(k2, k_ex);
		int k1_b = g4_helper.addKex(k2, k_ex);
		int k2_b = k2;
		const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
		const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
		const int i_b = b2 + nb * k1_b + no * w1_b;
		const int j_b = b4 + nb * k2_b + no * w2_b;

		@@ -397,18 +397,18 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
		int w2_a(w2);
		int k1_a(k1);
		int k2_a(k2);
		const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
		const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
		const int i_a = b1 + nb * k1_a + no * w1_a;
		const int j_a = b4 + nb * k2_a + no * w2_a;

		const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a);
		const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a);

		int w1_b(helper.addWex(w2, w_ex));
		int w2_b(helper.addWex(w1, w_ex));
		int k1_b = helper.addKex(k2, k_ex);
		int k2_b = helper.addKex(k1, k_ex);
		const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
		int w1_b(g4_helper.addWex(w2, w_ex));
		int w2_b(g4_helper.addWex(w1, w_ex));
		int k1_b = g4_helper.addKex(k2, k_ex);
		int k2_b = g4_helper.addKex(k1, k_ex);
		const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
		const int i_b = b2 + nb * k1_b + no * w1_b;
		const int j_b = b3 + nb * k2_b + no * w2_b;

		@@ -422,21 +422,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
		case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: {
		// contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s)
		int w1_a(w1);
		int w2_a(helper.addWex(w1, w_ex));
		int w2_a(g4_helper.addWex(w1, w_ex));
		int k1_a = k1;
		int k2_a = helper.addKex(k1, k_ex);
		const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
		int k2_a = g4_helper.addKex(k1, k_ex);
		const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
		const int i_a = b1 + nb * k1_a + no * w1_a;
		const int j_a = b3 + nb * k2_a + no * w2_a;

		const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a);
		const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a);

		int w1_b(helper.addWex(w2, w_ex));
		int w1_b(g4_helper.addWex(w2, w_ex));
		int w2_b(w2);
		int k1_b = helper.addKex(k2, k_ex);
		int k1_b = g4_helper.addKex(k2, k_ex);
		int k2_b = k2;
		const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
		const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
		const int i_b = b2 + nb * k1_b + no * w1_b;
		const int j_b = b4 + nb * k2_b + no * w2_b;

		@@ -537,6 +537,16 @@ template void updateG4<float, PARTICLE_HOLE_CHARGE>(
		const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

		// Explicit instantiation of updateG4 for single precision (float) and the
		// PARTICLE_HOLE_LONGITUDINAL_UP_UP channel. The parameter list mirrors the other
		// per-channel instantiations in this file, including the `atomic` flag.
		template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_UP>(
		std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
		const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

		// Explicit instantiation of updateG4 for single precision (float) and the
		// PARTICLE_HOLE_LONGITUDINAL_UP_DOWN channel. The parameter list mirrors the other
		// per-channel instantiations in this file, including the `atomic` flag.
		template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>(
		std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
		const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

		template void updateG4<float, PARTICLE_PARTICLE_UP_DOWN>(
		std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
		const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		@@ -557,6 +567,16 @@ template void updateG4<double, PARTICLE_HOLE_CHARGE>(
		const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

		// Explicit instantiation of updateG4 for double precision and the
		// PARTICLE_HOLE_LONGITUDINAL_UP_UP channel. The parameter list mirrors the other
		// per-channel instantiations in this file, including the `atomic` flag.
		template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_UP>(
		std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
		const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

		// Explicit instantiation of updateG4 for double precision and the
		// PARTICLE_HOLE_LONGITUDINAL_UP_DOWN channel. The parameter list mirrors the other
		// per-channel instantiations in this file, including the `atomic` flag.
		template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>(
		std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
		const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
		const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

		template void updateG4<double, PARTICLE_PARTICLE_UP_DOWN>(
		std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
		const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,