Commit 44842fe2 authored by Doak, Peter W.
Browse files

fixing some merge goofs. 4_point_type ordering

parent 1575944c
Loading
Loading
Loading
Loading
+22 −26
Original line number Diff line number Diff line
@@ -182,7 +182,6 @@ TpAccumulator<Parameters, linalg::CPU>::TpAccumulator(
    : G0_ptr_(&G0),
      thread_id_(thread_id),
      multiple_accumulators_(pars.get_accumulators() > 1),
      mode_(pars.get_four_point_type()),
      beta_(pars.get_beta()),
      extension_index_offset_((WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2),
      n_pos_frqs_(WTpExtPosDmn::dmn_size()),
@@ -434,6 +433,28 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) {
  const FourPointType channel = stringToFourPointType(channel_str);

  switch (channel) {
    case PARTICLE_HOLE_TRANSVERSE:
      // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)>
      //                  = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s)
      for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
        const int w_ex = exchange_frq[w_ex_idx];
        for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
          for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1)
            for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) {
              const int k_ex = exchange_mom[k_ex_idx];
              for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2)
                for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) {
                  Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx);
                  for (int s = 0; s < 2; ++s)
                    updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex),
                                   momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex),
                                   w_plus_w_ex(w1, w_ex), -sign_over_2, true);
                }
            }
      }
      flops += n_loops * 2 * flops_update_atomic;
      break;

    case PARTICLE_HOLE_MAGNETIC:
      // G4(k1, k2, k_ex) = 1/2 sum_{s1, s2} (s1 * s2)
      //                      <c^+(k1+k_ex, s1) c(k1, s1) c^+(k2, s2) c(k2+k_ex, s2)>
@@ -517,7 +538,6 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) {
                }
            }
      }

      flops += n_loops * 4 * flops_update_atomic;
      break;

@@ -540,33 +560,9 @@ double TpAccumulator<Parameters, linalg::CPU>::updateG4(TpGreensFunction& G4) {
                }
            }
      }

      flops += n_loops * 4 * flops_update_atomic;
      break;

    case PARTICLE_HOLE_TRANSVERSE:
      // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k1+k_ex, s) c(k1, -s) c^+(k2, -s) c(k2+k_ex, s)>
      //                  = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s)
      for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) {
        const int w_ex = exchange_frq[w_ex_idx];
        for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2)
          for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1)
            for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) {
              const int k_ex = exchange_mom[k_ex_idx];
              for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2)
                for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) {
                  Complex* const G4_ptr = &G4(0, 0, 0, 0, k1, k2, k_ex_idx, w1, w2, w_ex_idx);
                  for (int s = 0; s < 2; ++s)
                    updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex),
                                   momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex),
                                   w_plus_w_ex(w1, w_ex), -sign_over_2, true);
                }
            }
      }

      flops += n_loops * 2 * flops_update_atomic;
      break;

    case PARTICLE_PARTICLE_UP_DOWN:
      // G4(k1, k2, k_ex) = 1/2 sum_s <c^+(k_ex-k1, s) c^+(k1, -s) c(k2, -s) c(k_ex-k2, s)>
      //                  = 1/2 sum_s G(k_ex-k2, k_ex-k1, s) G(k2, k1, -s)
+8 −8
Original line number Diff line number Diff line
@@ -395,6 +395,12 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_
  const FourPointType channel = stringToFourPointType(channel_str);

  switch (channel) {
    case PARTICLE_HOLE_TRANSVERSE:
      details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>(
          get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
          G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
          nk_exchange, sign_, multiple_accumulators_, streams_[0]);
      break;
    case PARTICLE_HOLE_MAGNETIC:
      details::updateG4<Real, PARTICLE_HOLE_MAGNETIC>(
          get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
@@ -405,13 +411,13 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_
      details::updateG4<Real, PARTICLE_HOLE_CHARGE>(
          get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
          G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
          nk_exchange, sign_, streams_[0]);
          nk_exchange, sign_, multiple_accumulators_, streams_[0]);
      break;
    case PARTICLE_HOLE_LONGITUDINAL_UP_UP:
      details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_UP>(
          get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
          G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
          nk_exchange, sign_, streams_[0]);
          nk_exchange, sign_, multiple_accumulators_, streams_[0]);
      break;
    case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN:
      details::updateG4<Real, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>(
@@ -419,12 +425,6 @@ void TpAccumulator<Parameters, linalg::GPU>::updateG4(const std::size_t channel_
          G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
          nk_exchange, sign_, multiple_accumulators_, streams_[0]);
      break;
    case PARTICLE_HOLE_TRANSVERSE:
      details::updateG4<Real, PARTICLE_HOLE_TRANSVERSE>(
          get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
          G_[1].leadingDimension(), n_bands_, KDmn::dmn_size(), WTpPosDmn::dmn_size(), nw_exchange,
          nk_exchange, sign_, multiple_accumulators_, streams_[0]);
      break;
    case PARTICLE_PARTICLE_UP_DOWN:
      details::updateG4<Real, PARTICLE_PARTICLE_UP_DOWN>(
          get_G4()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(),
+38 −18
Original line number Diff line number Diff line
@@ -368,21 +368,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
    case PARTICLE_HOLE_LONGITUDINAL_UP_UP: {
      // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s)
      int w1_a(w1);
      int w2_a(helper.addWex(w1, w_ex));
      int w2_a(g4_helper.addWex(w1, w_ex));
      int k1_a = k1;
      int k2_a = helper.addKex(k1, k_ex);
      const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
      int k2_a = g4_helper.addKex(k1, k_ex);
      const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
      const int i_a = b1 + nb * k1_a + no * w1_a;
      const int j_a = b3 + nb * k2_a + no * w2_a;

      const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a);
      const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a);

      int w1_b(helper.addWex(w2, w_ex));
      int w1_b(g4_helper.addWex(w2, w_ex));
      int w2_b(w2);
      int k1_b = helper.addKex(k2, k_ex);
      int k1_b = g4_helper.addKex(k2, k_ex);
      int k2_b = k2;
      const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
      const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
      const int i_b = b2 + nb * k1_b + no * w1_b;
      const int j_b = b4 + nb * k2_b + no * w2_b;

@@ -397,18 +397,18 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
        int w2_a(w2);
        int k1_a(k1);
        int k2_a(k2);
        const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
        const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
        const int i_a = b1 + nb * k1_a + no * w1_a;
        const int j_a = b4 + nb * k2_a + no * w2_a;

        const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a);
        const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a);

        int w1_b(helper.addWex(w2, w_ex));
        int w2_b(helper.addWex(w1, w_ex));
        int k1_b = helper.addKex(k2, k_ex);
        int k2_b = helper.addKex(k1, k_ex);
        const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
        int w1_b(g4_helper.addWex(w2, w_ex));
        int w2_b(g4_helper.addWex(w1, w_ex));
        int k1_b = g4_helper.addKex(k2, k_ex);
        int k2_b = g4_helper.addKex(k1, k_ex);
        const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
        const int i_b = b2 + nb * k1_b + no * w1_b;
        const int j_b = b3 + nb * k2_b + no * w2_b;

@@ -422,21 +422,21 @@ __global__ void updateG4Kernel(CudaComplex<Real>* __restrict__ G4,
    case PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: {
      // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s)
      int w1_a(w1);
      int w2_a(helper.addWex(w1, w_ex));
      int w2_a(g4_helper.addWex(w1, w_ex));
      int k1_a = k1;
      int k2_a = helper.addKex(k1, k_ex);
      const bool conj_a = helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
      int k2_a = g4_helper.addKex(k1, k_ex);
      const bool conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a);
      const int i_a = b1 + nb * k1_a + no * w1_a;
      const int j_a = b3 + nb * k2_a + no * w2_a;

      const CudaComplex<Real> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a);
      const CudaComplex<Real> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a);

      int w1_b(helper.addWex(w2, w_ex));
      int w1_b(g4_helper.addWex(w2, w_ex));
      int w2_b(w2);
      int k1_b = helper.addKex(k2, k_ex);
      int k1_b = g4_helper.addKex(k2, k_ex);
      int k2_b = k2;
      const bool conj_b = helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
      const bool conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b);
      const int i_b = b2 + nb * k1_b + no * w1_b;
      const int j_b = b4 + nb * k2_b + no * w2_b;

@@ -537,6 +537,16 @@ template void updateG4<float, PARTICLE_HOLE_CHARGE>(
    const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
    const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

// Explicit instantiation of the updateG4 function template for single
// precision (std::complex<float> buffers) and the
// PARTICLE_HOLE_LONGITUDINAL_UP_UP channel.
// NOTE(review): presumably needed so that code outside this file can link
// against this specialization — confirm against the header's declaration.
template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_UP>(
    std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
    const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
    const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

// Explicit instantiation of the updateG4 function template for single
// precision (std::complex<float> buffers) and the
// PARTICLE_HOLE_LONGITUDINAL_UP_DOWN channel.
// NOTE(review): presumably needed so that code outside this file can link
// against this specialization — confirm against the header's declaration.
template void updateG4<float, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>(
    std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
    const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
    const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

template void updateG4<float, PARTICLE_PARTICLE_UP_DOWN>(
    std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
    const std::complex<float>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
@@ -557,6 +567,16 @@ template void updateG4<double, PARTICLE_HOLE_CHARGE>(
    const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
    const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

// Explicit instantiation of the updateG4 function template for double
// precision (std::complex<double> buffers) and the
// PARTICLE_HOLE_LONGITUDINAL_UP_UP channel.
// NOTE(review): presumably needed so that code outside this file can link
// against this specialization — confirm against the header's declaration.
template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_UP>(
    std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
    const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
    const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

// Explicit instantiation of the updateG4 function template for double
// precision (std::complex<double> buffers) and the
// PARTICLE_HOLE_LONGITUDINAL_UP_DOWN channel.
// NOTE(review): presumably needed so that code outside this file can link
// against this specialization — confirm against the header's declaration.
template void updateG4<double, PARTICLE_HOLE_LONGITUDINAL_UP_DOWN>(
    std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
    const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,
    const int nw_exchange, const int nk_exchange, const int sign, bool atomic, cudaStream_t stream);

template void updateG4<double, PARTICLE_PARTICLE_UP_DOWN>(
    std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
    const std::complex<double>* G_down, const int ldgd, const int nb, const int nk, const int nw_pos,