Unverified Commit aefe49e4 authored by Peter Doak's avatar Peter Doak Committed by GitHub
Browse files

Merge pull request #83 from CompFUSE/minor_g4_fixes

Minor g4 fixes
parents 5451cb26 40bc3a12
Loading
Loading
Loading
Loading
+17 −15
Original line number Diff line number Diff line
@@ -383,10 +383,12 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {

  switch (mode_) {
    case PARTICLE_HOLE_MAGNETIC:
      // Note: sums over spin indices are implied.
      //
      // G4(k1, k2, k_ex) = 1/2 (s1 * s2) <c^+(k1 + k_ex, s1) c(k1, s1)
      //                    c^+(k2, s2) c(k2 + k_ex, s2)>
      //                     = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) - (s1 ==s2)
      //                       G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>
      //                  = 1/2 (s1 * s2) <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) -
      //                    (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>.
      for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
        const int w_ex = exchange_frq[w_ex_idx];
        for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
@@ -412,7 +414,7 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {
    case PARTICLE_HOLE_CHARGE:
      // G4(k1, k2, k_ex) += 1/2  <c^+(k1 + k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2 + k_ex, s2)> =
      //                  = 1/2 <G(k1, k1 + k_ex, s1) G(k2 + k_ex, k2, s2) -
      //                  (s1 ==s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>
      //                    (s1 == s2) G(k2 + k_ex, k1 + k_ex, s1) G(k1, k2, s1)>.
      for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
        const int w_ex = exchange_frq[w_ex_idx];
        for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
@@ -437,8 +439,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {
      break;

    case PARTICLE_HOLE_TRANSVERSE:
      // G4 = 1/2 \sum_s <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)>
      //    = -1/2 \sum_s G( k2 + k_ex, k1 + k_ex, s) G4(k1, k2, -s)
      // G4 = 1/2 <c^+(k1 + k_ex, s) c(k1, -s) c^+(k2, -s) c(k2 + k_ex, s)>
      //    = -1/2 G(k2 + k_ex, k1 + k_ex, s) G(k1, k2, -s).
      for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
        const int w_ex = exchange_frq[w_ex_idx];
        for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
@@ -460,8 +462,8 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4() {
      break;

    case PARTICLE_PARTICLE_UP_DOWN:
      // G4 = 1/2 \sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)>
      //    = 1/2 \sum_s G(k_ex-k2, k_ex-k1, s) G4(k2, k1, -s)
      // G4 = 1/2 <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)>
      //    = 1/2 G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s).
      for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
        const int w_ex = exchange_frq[w_ex_idx];
        for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
@@ -531,9 +533,9 @@ void TpAccumulator<Parameters, linalg::CPU>::updateG4SpinDifference(
    const bool cross_legs) {
  // This function performs the following update for each band:
  //
  // G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a) * G(up, k1_b, k2_b, w1_b, w2_b)
  //                       + sign * G(down, k1_a, k2_a, w1_a, w2_a) * G(down, k1_b, k2_b, w1_b,
  //                       w2_b)
  // G4(k1, k2, w1, w2) += alpha * (G(up, k1_a, k2_a, w1_a, w2_a)
  //                       + sign * G(down,k1_a, k2_a, w1_a, w2_a)) *
  //                          (G(up,k1_b,k2_b,w1_b,w2_b) + sign * G(down,k1_b,k2_b,w1_b,w2_b))
  if (n_bands_ == 1) {
      *G4_ptr += alpha * (getGSingleband(0, k1_a, k2_a, w1_a, w2_a) +
			  Complex(sign) * getGSingleband(1, k1_a, k2_a, w1_a, w2_a)) *
+1 −1
Original line number Diff line number Diff line
@@ -99,7 +99,7 @@ void accumulateOnDevice(const double* M, const int ldm, const ScalarType sign, S
                        const ConfigElem* config_right, const ScalarType* tau,
                        const ScalarType* cubic_coeff, const int size, cudaStream_t stream_) {
  const auto& helper = HelperSelector<ScalarType>::value;
  const static int convolution_size = 2 * helper.get_oversampling() + 1;
  const int convolution_size = 2 * helper.get_oversampling() + 1;
  const auto blocks = getBlockSize(size * size * convolution_size, 128);

  // TODO: check if there is a performance gain in using a block size that is a multiple of
+17 −10
Original line number Diff line number Diff line
@@ -164,9 +164,9 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>*
        return candidate;
    return -1;
  };
  const static int width = get_block_width();

  const static auto blocks = getBlockSize(n_rows, n_rows * 2, width);
  const int width = get_block_width();
  const auto blocks = getBlockSize(n_rows, n_rows * 2, width);

  computeGMultibandKernel<<<blocks[0], blocks[1], width * width * sizeof(std::complex<Real>), stream>>>(
      castCudaComplex(G), ldg, castCudaComplex(G0), ldg0, nb, nk, nw_pos, beta);
@@ -266,7 +266,11 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
      const bool conj_b = helper.extendGIndices(k1_b, k2_b,w1_b, w2_b);
      const int i_b = b2 + nb * k1_b + no * w1_b;
      const int j_b = b3 + nb * k2_b + no * w2_b;


      const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b);


      const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b);

      contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2);
@@ -323,7 +327,10 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
      const int i_b = b2 + nb * k1_b + no * w1_b;
      const int j_b = b3 + nb * k2_b + no * w2_b;


      const CudaComplex<Real> Gb_1 = cond_conj(G_up[i_b + ldgu * j_b], conj_b);


      const CudaComplex<Real> Gb_2 = cond_conj(G_down[i_b + ldgd * j_b], conj_b);

      contribution = -(Ga_1 * Gb_1 + Ga_2 * Gb_2);
@@ -400,10 +407,10 @@ void updateG4(std::complex<Real>* G4, const std::complex<Real>* G_up, const int
              const std::complex<Real>* G_down, const int ldgd, const int nb, const int nk,
              const int nw_pos, const int nw_exchange, const int nk_exchange, const int sign,
              cudaStream_t stream) {
  const static int nw = 2 * nw_pos;
  const static int size_12 = nw * nk * nb * nb;
  const static int size_3 = nw_exchange * nk_exchange;
  const static auto blocks = getBlockSize3D(size_12, size_12, size_3);
  const int nw = 2 * nw_pos;
  const int size_12 = nw * nk * nb * nb;
  const int size_3 = nw_exchange * nk_exchange;
  const auto blocks = getBlockSize3D(size_12, size_12, size_3);

  updateG4Kernel<Real, type><<<blocks[0], blocks[1], 0, stream>>>(
      castCudaComplex(G4), castCudaComplex(G_up), ldgu, castCudaComplex(G_down), ldgd, nb, nk, nw,