Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp +17 −15 Original line number Diff line number Diff line Loading @@ -383,10 +383,12 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { switch (mode_) { case PARTICLE_HOLE_MAGNETIC: // Note: sums over spin indices are implied. // // G4(k1, k2, k_ex) = 1/2 (s1 * s2) <c^+(k1 + k_ex, s1) c(k1, s1) // c^+(k2, s2) c(k2 + k_ex, s2)> // = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - (s1 ==s2) // G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)> // = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - // (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>. for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading @@ -412,7 +414,7 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { case PARTICLE_HOLE_CHARGE: // G4(k1, k2, k_ex) += 1/2 <c^+(k1 + k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2 + k_ex, s2)> = // = 1/2 <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - // (s1 ==s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)> // (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>. for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading @@ -437,8 +439,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { break; case PARTICLE_HOLE_TRANSVERSE: // G4 = 1/2 \sum_s <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)> // = -1/2 \sum_s G( k2 + k_ex, k1 + k_ex, s) G4(k1, k2, -s) // G4 = 1/2 <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)> // = -1/2 G(k2 + k_ex, k1 + k_ex, s) G(k1, k2, -s). for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading @@ -460,8 +462,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { break; case PARTICLE_PARTICLE_UP_DOWN: // G4 = 1/2 \sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)> // = 1/2 \sum_s G(k_ex-k2, k_ex-k1, s) G4(k2, k1, -s) // G4 = 1/2 <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)> // = 1/2 G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s). for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading Loading @@ -531,9 +533,9 @@ void TpAccumulator<Parameters, linalg::CPU>::updateG4SpinDifference( const bool cross_legs) { // This function performs the following update for each band: // // G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a) * G(up, k1_b, k2_b, w1_b, w2_b) // + sign * G(down, k1_a, k2_a, w1_a, w2_a) * G(down, k1_b, k2_b, w1_b, // w2_b) // G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a) // + sign * G(down,k1_a, k2_a, w1_a, w2_a)) * // (G(up,k1_b,k2_b,w1_b,w2_b) + sign * G(down,k1_b,k2_b,w1_b,w2_b)) if (n_bands_ == 1) { *G4_ptr += alpha * (getGSingleband(0, k1_a, k2_a, w1_a, w2_a) + Complex(sign) * getGSingleband(1, k1_a, k2_a, w1_a, w2_a)) * Loading src/math/nfft/dnfft_1d_kernels.cu +1 −1 Original line number Diff line number Diff line Loading @@ -99,7 +99,7 @@ void accumulateOnDevice(const double* M, const int ldm, const ScalarType sign, S const ConfigElem* config_right, const ScalarType* tau, const ScalarType* cubic_coeff, const int size, cudaStream_t stream_) { const auto& helper = HelperSelector<ScalarType>::value; const static int convolution_size = 2 * helper.get_oversampling() + 1; const int convolution_size = 2 * helper.get_oversampling() + 1; const auto blocks = getBlockSize(size * size * convolution_size, 128); // TODO: check if there is a performance gain in using a block size that is a multiple of Loading src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +17 −10 Original line number Diff line number Diff line Loading @@ -164,9 +164,9 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>* return candidate; return -1; }; const static int width = get_block_width(); const static auto blocks = getBlockSize(n_rows, n_rows * 2, width); const int width = get_block_width(); const auto blocks = getBlockSize(n_rows, n_rows * 2, width); computeGMultibandKernel<<<blocks[0], blocks[1], width * width * sizeof(std::complex<Real>), stream>>>( castCudaComplex(G), ldg, castCudaComplex(G0), ldg0, nb, nk, nw_pos, beta); Loading Loading @@ -266,7 +266,11 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, const bool conj_b = helper.extendGIndices(k1_b, k2_b,w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b3 + nb * k2_b + no * w2_b; const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2); Loading Loading @@ -323,7 +327,10 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b3 + nb * k2_b + no * w2_b; const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2); Loading Loading @@ -400,10 +407,10 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, cudaStream_t stream) { const static int nw = 2 * nw_pos; const static int size_12 = nw * nk * nb * nb; const static int size_3 = nw_exchange * nk_exchange; const static auto blocks = getBlockSize3D(size_12, size_12, size_3); const int nw = 2 * nw_pos; const int size_12 = nw * nk * nb * nb; const int size_3 = nw_exchange * nk_exchange; const auto blocks = getBlockSize3D(size_12, size_12, size_3); updateG4Kernel<Real, type><<<blocks[0], blocks[1], 0, stream>>>( castCudaComplex(G4), castCudaComplex(G_up), ldgu, castCudaComplex(G_down), ldgd, nb, nk, nw, Loading Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator.hpp +17 −15 Original line number Diff line number Diff line Loading @@ -383,10 +383,12 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { switch (mode_) { case PARTICLE_HOLE_MAGNETIC: // Note: sums over spin indices are implied. // // G4(k1, k2, k_ex) = 1/2 (s1 * s2) <c^+(k1 + k_ex, s1) c(k1, s1) // c^+(k2, s2) c(k2 + k_ex, s2)> // = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - (s1 ==s2) // G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)> // = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - // (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>. for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading @@ -412,7 +414,7 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { case PARTICLE_HOLE_CHARGE: // G4(k1, k2, k_ex) += 1/2 <c^+(k1 + k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2 + k_ex, s2)> = // = 1/2 <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - // (s1 ==s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)> // (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>. for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading @@ -437,8 +439,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { break; case PARTICLE_HOLE_TRANSVERSE: // G4 = 1/2 \sum_s <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)> // = -1/2 \sum_s G( k2 + k_ex, k1 + k_ex, s) G4(k1, k2, -s) // G4 = 1/2 <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)> // = -1/2 G(k2 + k_ex, k1 + k_ex, s) G(k1, k2, -s). for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading @@ -460,8 +462,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() { break; case PARTICLE_PARTICLE_UP_DOWN: // G4 = 1/2 \sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)> // = 1/2 \sum_s G(k_ex-k2, k_ex-k1, s) G4(k2, k1, -s) // G4 = 1/2 <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)> // = 1/2 G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s). for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) Loading Loading @@ -531,9 +533,9 @@ void TpAccumulator<Parameters, linalg::CPU>::updateG4SpinDifference( const bool cross_legs) { // This function performs the following update for each band: // // G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a) * G(up, k1_b, k2_b, w1_b, w2_b) // + sign * G(down, k1_a, k2_a, w1_a, w2_a) * G(down, k1_b, k2_b, w1_b, // w2_b) // G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a) // + sign * G(down,k1_a, k2_a, w1_a, w2_a)) * // (G(up,k1_b,k2_b,w1_b,w2_b) + sign * G(down,k1_b,k2_b,w1_b,w2_b)) if (n_bands_ == 1) { *G4_ptr += alpha * (getGSingleband(0, k1_a, k2_a, w1_a, w2_a) + Complex(sign) * getGSingleband(1, k1_a, k2_a, w1_a, w2_a)) * Loading
src/math/nfft/dnfft_1d_kernels.cu +1 −1 Original line number Diff line number Diff line Loading @@ -99,7 +99,7 @@ void accumulateOnDevice(const double* M, const int ldm, const ScalarType sign, S const ConfigElem* config_right, const ScalarType* tau, const ScalarType* cubic_coeff, const int size, cudaStream_t stream_) { const auto& helper = HelperSelector<ScalarType>::value; const static int convolution_size = 2 * helper.get_oversampling() + 1; const int convolution_size = 2 * helper.get_oversampling() + 1; const auto blocks = getBlockSize(size * size * convolution_size, 128); // TODO: check if there is a performance gain in using a block size that is a multiple of Loading
src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +17 −10 Original line number Diff line number Diff line Loading @@ -164,9 +164,9 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>* return candidate; return -1; }; const static int width = get_block_width(); const static auto blocks = getBlockSize(n_rows, n_rows * 2, width); const int width = get_block_width(); const auto blocks = getBlockSize(n_rows, n_rows * 2, width); computeGMultibandKernel<<<blocks[0], blocks[1], width * width * sizeof(std::complex<Real>), stream>>>( castCudaComplex(G), ldg, castCudaComplex(G0), ldg0, nb, nk, nw_pos, beta); Loading Loading @@ -266,7 +266,11 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, const bool conj_b = helper.extendGIndices(k1_b, k2_b,w1_b, w2_b); const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b3 + nb * k2_b + no * w2_b; const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2); Loading Loading @@ -323,7 +327,10 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4, const int i_b = b2 + nb * k1_b + no * w1_b; const int j_b = b3 + nb * k2_b + no * w2_b; const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2); Loading Loading @@ -400,10 +407,10 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign, cudaStream_t stream) { const static int nw = 2 * nw_pos; const static int size_12 = nw * nk * nb * nb; const static int size_3 = nw_exchange * nk_exchange; const static auto blocks = getBlockSize3D(size_12, size_12, size_3); const int nw = 2 * nw_pos; const int size_12 = nw * nk * nb * nb; const int size_3 = nw_exchange * nk_exchange; const auto blocks = getBlockSize3D(size_12, size_12, size_3); updateG4Kernel<Real, type><<<blocks[0], blocks[1], 0, stream>>>( castCudaComplex(G4), castCudaComplex(G_up), ldgu, castCudaComplex(G_down), ldgd, nb, nk, nw, Loading