Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp +22 −26 Original line number Diff line number Diff line Loading @@ -182,7 +182,6 @@ TpAccumulator<Parameters, linalg::CPU>::TpAccumulator( : G0_ptr_(&G0), thread_id_(thread_id), multiple_accumulators_(pars.get_accumulators() > 1), mode_(pars.get_four_point_type()), beta_(pars.get_beta()), extension_index_offset_((WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2), n_pos_frqs_(WTpExtPosDmn::dmn_size()), Loading Loading @@ -434,6 +433,28 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) { const FourPointType channel = stringToFourPointType(channel_str); switch (channel) { case PARTICLE_HOLE_TRANSVERSE: // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)> // = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s) for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx); for (int s = 0; s < 2; ++s) updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), -sign_over_2, true); } } } flops += n_loops * 2 * flops_update_atomic; break; case PARTICLE_HOLE_MAGNETIC: // G4(k1, k2, k_ex) = 1/2 sum_{s1, s2} (s1 * s2) // <c^+(k1+k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2+k_ex, s2)> Loading Loading @@ -517,7 +538,6 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) { } } } flops += n_loops * 4 * flops_update_atomic; break; Loading @@ -540,33 +560,9 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) { } } } flops += n_loops * 4 * flops_update_atomic; break; case PARTICLE_HOLE_TRANSVERSE: // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)> // = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s) for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx); for (int s = 0; s < 2; ++s) updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), -sign_over_2, true); } } } flops += n_loops * 2 * flops_update_atomic; break; case PARTICLE_PARTICLE_UP_DOWN: // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)> // = 1/2 sum_s G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s) Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +8 −8 Original line number Diff line number Diff line Loading @@ -395,6 +395,12 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_ const FourPointType channel = stringToFourPointType(channel_str); switch (channel) { case PARTICLE_HOLE_TRANSVERSE: details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_MAGNETIC: details::updateG4<Real, PARTICLE_HOLE_MAGNETIC>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), Loading @@ -405,13 +411,13 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_ details::updateG4<Real, PARTICLE_HOLE_CHARGE>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, streams_[0]); nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_LONGITUDINAL_UP_UP: details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_UP>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, streams_[0]); nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>( Loading @@ -419,12 +425,6 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_ G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_TRANSVERSE: details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_PARTICLE_UP_DOWN: details::updateG4<Real, PARTICLE_PARTICLE_UP_DOWN>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), Loading src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +38 −18 Original line number Diff line number Diff line Loading @@ -368,21 +368,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, case PARTICLE_HOLE_LONGITUDINAL_UP_UP: { // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s) int w1_a(w1); int w2_a(helper.addWex(w1, w_ex)); int w2_a(g4_helper.addWex(w1, w_ex)); int k1_a = k1; int k2_a = helper.addKex(k1, k_ex); const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); int k2_a = g4_helper.addKex(k1, k_ex); const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const int i_a = b1 + nb * k1_a + no * w1_a; const int j_a = b3 + nb * k2_a + no * w2_a; const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); int w1_b(helper.addWex(w2, w_ex)); int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(w2); int k1_b = helper.addKex(k2, k_ex); int k1_b = g4_helper.addKex(k2, k_ex); int k2_b = k2; const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b4 + nb * k2_b + no * w2_b; Loading @@ -397,18 +397,18 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, int w2_a(w2); int k1_a(k1); int k2_a(k2); const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const int i_a = b1 + nb * k1_a + no * w1_a; const int j_a = b4 + nb * k2_a + no * w2_a; const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); int w1_b(helper.addWex(w2, w_ex)); int w2_b(helper.addWex(w1, w_ex)); int k1_b = helper.addKex(k2, k_ex); int k2_b = helper.addKex(k1, k_ex); const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(g4_helper.addWex(w1, w_ex)); int k1_b = g4_helper.addKex(k2, k_ex); int k2_b = g4_helper.addKex(k1, k_ex); const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b3 + nb * k2_b + no * w2_b; Loading @@ -422,21 +422,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: { // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s) int w1_a(w1); int w2_a(helper.addWex(w1, w_ex)); int w2_a(g4_helper.addWex(w1, w_ex)); int k1_a = k1; int k2_a = helper.addKex(k1, k_ex); const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); int k2_a = g4_helper.addKex(k1, k_ex); const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const int i_a = b1 + nb * k1_a + no * w1_a; const int j_a = b3 + nb * k2_a + no * w2_a; const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); int w1_b(helper.addWex(w2, w_ex)); int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(w2); int k1_b = helper.addKex(k2, k_ex); int k1_b = g4_helper.addKex(k2, k_ex); int k2_b = k2; const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b4 + nb * k2_b + no * w2_b; Loading Loading @@ -537,6 +537,16 @@ template void updateG4<float, PARTICLE_HOLE_CHARGE>( const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_UP>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<float, PARTICLE_PARTICLE_UP_DOWN>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, Loading @@ -557,6 +567,16 @@ template void updateG4<double, PARTICLE_HOLE_CHARGE>( const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_UP>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<double, PARTICLE_PARTICLE_UP_DOWN>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, Loading Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp +22 −26 Original line number Diff line number Diff line Loading @@ -182,7 +182,6 @@ TpAccumulator<Parameters, linalg::CPU>::TpAccumulator( : G0_ptr_(&G0), thread_id_(thread_id), multiple_accumulators_(pars.get_accumulators() > 1), mode_(pars.get_four_point_type()), beta_(pars.get_beta()), extension_index_offset_((WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2), n_pos_frqs_(WTpExtPosDmn::dmn_size()), Loading Loading @@ -434,6 +433,28 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) { const FourPointType channel = stringToFourPointType(channel_str); switch (channel) { case PARTICLE_HOLE_TRANSVERSE: // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)> // = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s) for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx); for (int s = 0; s < 2; ++s) updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), -sign_over_2, true); } } } flops += n_loops * 2 * flops_update_atomic; break; case PARTICLE_HOLE_MAGNETIC: // G4(k1, k2, k_ex) = 1/2 sum_{s1, s2} (s1 * s2) // <c^+(k1+k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2+k_ex, s2)> Loading Loading @@ -517,7 +538,6 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) { } } } flops += n_loops * 4 * flops_update_atomic; break; Loading @@ -540,33 +560,9 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) { } } } flops += n_loops * 4 * flops_update_atomic; break; case PARTICLE_HOLE_TRANSVERSE: // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)> // = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s) for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx); for (int s = 0; s < 2; ++s) updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), -sign_over_2, true); } } } flops += n_loops * 2 * flops_update_atomic; break; case PARTICLE_PARTICLE_UP_DOWN: // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)> // = 1/2 sum_s G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s) Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +8 −8 Original line number Diff line number Diff line Loading @@ -395,6 +395,12 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_ const FourPointType channel = stringToFourPointType(channel_str); switch (channel) { case PARTICLE_HOLE_TRANSVERSE: details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_MAGNETIC: details::updateG4<Real, PARTICLE_HOLE_MAGNETIC>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), Loading @@ -405,13 +411,13 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_ details::updateG4<Real, PARTICLE_HOLE_CHARGE>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, streams_[0]); nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_LONGITUDINAL_UP_UP: details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_UP>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, streams_[0]); nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>( Loading @@ -419,12 +425,6 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_ G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_HOLE_TRANSVERSE: details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange, nk_exchange, sign_, multiple_accumulators_, streams_[0]); break; case PARTICLE_PARTICLE_UP_DOWN: details::updateG4<Real, PARTICLE_PARTICLE_UP_DOWN>( get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), Loading
src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +38 −18 Original line number Diff line number Diff line Loading @@ -368,21 +368,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, case PARTICLE_HOLE_LONGITUDINAL_UP_UP: { // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s) int w1_a(w1); int w2_a(helper.addWex(w1, w_ex)); int w2_a(g4_helper.addWex(w1, w_ex)); int k1_a = k1; int k2_a = helper.addKex(k1, k_ex); const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); int k2_a = g4_helper.addKex(k1, k_ex); const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const int i_a = b1 + nb * k1_a + no * w1_a; const int j_a = b3 + nb * k2_a + no * w2_a; const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); int w1_b(helper.addWex(w2, w_ex)); int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(w2); int k1_b = helper.addKex(k2, k_ex); int k1_b = g4_helper.addKex(k2, k_ex); int k2_b = k2; const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b4 + nb * k2_b + no * w2_b; Loading @@ -397,18 +397,18 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, int w2_a(w2); int k1_a(k1); int k2_a(k2); const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const int i_a = b1 + nb * k1_a + no * w1_a; const int j_a = b4 + nb * k2_a + no * w2_a; const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); int w1_b(helper.addWex(w2, w_ex)); int w2_b(helper.addWex(w1, w_ex)); int k1_b = helper.addKex(k2, k_ex); int k2_b = helper.addKex(k1, k_ex); const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(g4_helper.addWex(w1, w_ex)); int k1_b = g4_helper.addKex(k2, k_ex); int k2_b = g4_helper.addKex(k1, k_ex); const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b3 + nb * k2_b + no * w2_b; Loading @@ -422,21 +422,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: { // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s) int w1_a(w1); int w2_a(helper.addWex(w1, w_ex)); int w2_a(g4_helper.addWex(w1, w_ex)); int k1_a = k1; int k2_a = helper.addKex(k1, k_ex); const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); int k2_a = g4_helper.addKex(k1, k_ex); const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); const int i_a = b1 + nb * k1_a + no * w1_a; const int j_a = b3 + nb * k2_a + no * w2_a; const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); int w1_b(helper.addWex(w2, w_ex)); int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(w2); int k1_b = helper.addKex(k2, k_ex); int k1_b = g4_helper.addKex(k2, k_ex); int k2_b = k2; const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b4 + nb * k2_b + no * w2_b; Loading Loading @@ -537,6 +537,16 @@ template void updateG4<float, PARTICLE_HOLE_CHARGE>( const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_UP>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<float, PARTICLE_PARTICLE_UP_DOWN>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, Loading @@ -557,6 +567,16 @@ template void updateG4<double, PARTICLE_HOLE_CHARGE>( const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_UP>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream); template void updateG4<double, PARTICLE_PARTICLE_UP_DOWN>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, Loading