Commit ddf28be3 authored by Doak, Peter W.'s avatar Doak, Peter W.
Browse files

Working complex-G0 G4 accumulation on GPU (verified for the Rashba lattice)

parent 71f005e2
Loading
Loading
Loading
Loading
+8 −2
Original line number Diff line number Diff line
@@ -41,8 +41,14 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>*

// Updates G4 in the range [start, end)
template <typename Scalar, FourPointType type, typename SignType>
double updateG4(Scalar* G4, const Scalar* G_dn, const int ldgd,
               const Scalar* G_up, const int ldgu, const SignType factor,
double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu,
               const Scalar* G_dn, const int ldgd, const SignType factor,
               bool atomic, cudaStream_t stream, std::size_t start,
               std::size_t end);

// Updates G4 in the range [start, end)
template <typename Scalar, FourPointType type, typename SignType>
double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor,
               bool atomic, cudaStream_t stream, std::size_t start,
               std::size_t end);

+48 −38
Original line number Diff line number Diff line
@@ -195,6 +195,7 @@ protected:
  using Base::G4_;
  using Base::multiple_accumulators_;
  using Base::n_bands_;

  // using Base::n_pos_frqs_;

  using Base::non_density_density_;
@@ -250,7 +251,9 @@ protected:
#endif

#ifndef NDEBUG
  std::array<linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>,2> G_debug_;
  std::array<
      linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>, 2>
      G_debug_;
#endif

#ifndef DCA_HAVE_GPU_AWARE_MPI
@@ -307,7 +310,6 @@ void TpAccumulator<Parameters, DT, linalg::GPU>::resetG4() {
      G4_channel.setStream(reset_stream);
      G4_channel.resizeNoCopy(G4_[0].size());
      G4_channel.setToZero(reset_stream);
      
    }
    catch (std::bad_alloc& err) {
      std::cerr << "Failed to allocate G4 on device.\n";
@@ -419,6 +421,7 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch
  uint64_t start = Base::G4_[0].get_start();
  uint64_t end =
      Base::G4_[0].get_end() + 1;  // because the kernel expects this to be one past the end index
  if constexpr (Base::spin_symmetric_) {
    switch (channel) {
      case FourPointType::PARTICLE_HOLE_TRANSVERSE:
        return details::updateG4<TpComplex, FourPointType::PARTICLE_HOLE_TRANSVERSE>(
@@ -454,6 +457,14 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch
        throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu.");
    }
  }
  else {
    if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN )
      return details::updateG4NoSpin<TpComplex, FourPointType::PARTICLE_PARTICLE_UP_DOWN>(
          get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), factor,
          multiple_accumulators_, queues_[0], start, end);
  }
  throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu.");
}

template <class Parameters, DistType DT>
void TpAccumulator<Parameters, DT, linalg::GPU>::finalize() {
@@ -498,8 +509,7 @@ const std::vector<typename TpAccumulator<Parameters, DT, linalg::GPU>::Base::TpG

#ifndef NDEBUG
template <class Parameters, DistType DT>
const auto& TpAccumulator<
    Parameters, DT, linalg::GPU>::get_G_Debug() {
const auto& TpAccumulator<Parameters, DT, linalg::GPU>::get_G_Debug() {
  if (G_debug_.empty())
    throw std::logic_error("There is no G4 stored in this class.");

+169 −0
Original line number Diff line number Diff line
@@ -745,6 +745,162 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_
  }
}
  
template <typename Scalar, FourPointType type, typename SignType>
__global__ void updateG4KernelNoSpin(CudaComplex<RealAlias<Scalar>>* __restrict__ G4,
                                     const CudaComplex<RealAlias<Scalar>>* __restrict__ G_up,
                                     const int ldgu, const SignType factor, const bool atomic,
                                     const uint64_t start, const uint64_t end) {
  // Accumulates the non-spin-symmetric contribution to G4 for the flat G4 indices in
  // [start, end). Launch layout: 1-D grid, one thread per G4 element; threads past `end`
  // return immediately, so the grid may overshoot.
  //
  // G_up is a single Green's function matrix addressed as G_up[i + ldgu * j]
  // (leading dimension ldgu). G4 points at the element corresponding to `start`,
  // i.e. it is indexed by the block-local offset, not the global g4_index.
  //
  // TODO: reduce code duplication with updateG4Kernel.
  // TODO: decrease, if possible, register pressure. E.g. a single thread computes all bands.

  const uint64_t local_g4_index =
      static_cast<uint64_t>(blockIdx.x) * static_cast<uint64_t>(blockDim.x) +
      static_cast<uint64_t>(threadIdx.x);

  const uint64_t g4_index = local_g4_index + start;

  if (g4_index >= end) {  // out of domain.
    return;
  }

  // Convert the (possibly real or integral) sign factor into the working scalar type.
  Scalar complex_factor;
  dca::linalg::assign(complex_factor, factor);
  // NOTE(review): the spin-symmetric kernel also computes factor / 2; it is not needed for
  // the single PARTICLE_PARTICLE_UP_DOWN term below, so no halved factor is kept here.

  // Decompose the flat G4 index into band (b1..b4), momentum (k1, k2, k_ex) and
  // frequency (w1, w2, w_ex) indices.
  int b1, b2, b3, b4, k1, k2, k_ex, w1, w2, w_ex;
  g4_helper.unrollIndex(g4_index, b1, b2, b3, b4, k1, w1, k2, w2, k_ex, w_ex);

  const int nb = g4_helper.get_bands();
  const int nk = g4_helper.get_cluster_size();

  // Zero-initialized so that an instantiation with a FourPointType not handled below
  // accumulates nothing instead of an uninitialized value.
  CudaComplex<RealAlias<Scalar>> contribution{};
  const unsigned no = nk * nb;  // combined momentum-band stride of one frequency slice.

  // This code needs to be repeated over and over.  This happens in getGMultiband in the cpu
  // implementation. The gpu code is structured differently so without significant restructuring
  // this can't happen in the extendGIndiciesMultiBand routines.
  // Adds one band offset to each of (ia, ib); `cond` selects whether the band pair is
  // applied swapped. Both call sites below pass cond == true, mirroring the CPU band layout.
  auto condSwapAdd = [](int& ia, int& ib, const int ba, const int bb, const bool cond) {
    if (cond) {
      ia += bb;
      ib += ba;
    }
    else {
      ia += ba;
      ib += bb;
    }
  };
  // Compute the contribution to G4. In all the products of Green's function of type Ga * Gb,
  // the dependency on the bands is implied as Ga(b1, b2) * Gb(b2, b3). Sums and differences with
  // the exchange momentum, implies the same operation is performed with the exchange frequency.
  // See tp_accumulator.hpp for more details.
  if constexpr (type == FourPointType::PARTICLE_PARTICLE_UP_DOWN) {
    {
      // Direct term: G(k1, k2) * G(k_ex - k1, k_ex - k2).
      int w1_a(w1);
      int w2_a(w2);
      int k1_a(k1);
      int k2_a(k2);
      g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a);

      int w1_b(g4_helper.wexMinus(w1, w_ex));
      int w2_b(g4_helper.wexMinus(w2, w_ex));
      int k1_b = g4_helper.kexMinus(k1, k_ex);
      int k2_b = g4_helper.kexMinus(k2, k_ex);
      g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b);

      int i_a = nb * k1_a + no * w1_a;
      int j_a = nb * k2_a + no * w2_a;
      condSwapAdd(i_a, j_a, b1, b3, true);

      int i_b = nb * k1_b + no * w1_b;
      int j_b = nb * k2_b + no * w2_b;
      condSwapAdd(i_b, j_b, b2, b4, true);

      const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a];
      const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b];

      contribution = complex_factor * (Ga_1 * Gb_1);
    }
    {
      // Exchange term, subtracted: momenta/frequencies crossed with the exchange channel.
      int w1_a(w1);
      int w2_a(g4_helper.wexMinus(w2, w_ex));
      int k1_a(k1);
      int k2_a(g4_helper.kexMinus(k2, k_ex));
      g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a);

      int w1_b(g4_helper.wexMinus(w1, w_ex));
      int w2_b(w2);
      int k1_b(g4_helper.kexMinus(k1, k_ex));
      int k2_b(k2);
      g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b);

      int i_a = nb * k1_a + no * w1_a;
      int j_a = nb * k2_a + no * w2_a;
      condSwapAdd(i_a, j_a, b1, b4, true);

      int i_b = nb * k1_b + no * w1_b;
      int j_b = nb * k2_b + no * w2_b;
      condSwapAdd(i_b, j_b, b2, b3, true);

      const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a];
      const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b];

      contribution -= complex_factor * (Ga_1 * Gb_1);
    }
  }
  // G4 already points at `start`, so address with the block-local offset.
  decltype(G4) const result_ptr = G4 + local_g4_index;
  if (atomic)  // multiple accumulators may write the same element concurrently.
    dca::linalg::atomicAdd(result_ptr, contribution);
  else
    *result_ptr += contribution;
}

// Launches the non-spin-symmetric G4 update kernel for the index range [start, end) on
// `stream` and returns an estimate of the floating point operations performed.
// G_up: single-spin Green's function with leading dimension ldgu. `atomic` selects
// atomic accumulation into G4 (needed when several accumulators share one G4 buffer).
// Throws std::runtime_error if the kernel launch fails and std::logic_error for an
// unsupported FourPointType.
template <typename Scalar, FourPointType type, typename SignType>
double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor, bool atomic, cudaStream_t stream,
                std::size_t start, std::size_t end) {
  constexpr std::size_t block_size = 256;
  const unsigned grid_size = dca::util::ceilDiv(end - start, block_size);

  using dca::util::GPUTypeConversion;
  updateG4KernelNoSpin<dca::util::CUDATypeMap<Scalar>, type><<<grid_size, block_size, 0, stream>>>(
      castGPUType(G4), castGPUType(G_up), ldgu, GPUTypeConversion(factor), atomic, start, end);

  // Check for launch errors without clearing the sticky error state.
  const cudaError_t launch_status = cudaPeekAtLastError();
  if (launch_status != cudaSuccess) {
    linalg::util::printErrorMessage(launch_status, __FUNCTION__, __FILE__, __LINE__);
    throw(std::runtime_error("CUDA failed to launch the G4 kernel."));
  }

  // Flop estimate per updated G4 entry, by channel.
  const std::size_t n_updates = end - start;
  switch (type) {
      // Note: sign flips are ignored and a single complex * real multiplication is
      // present in all modes.
    case FourPointType::PARTICLE_HOLE_TRANSVERSE:
      // 2 complex additions and 2 complex multiplications per entry.
      return 18. * n_updates;
    case FourPointType::PARTICLE_HOLE_MAGNETIC:
      // 3 complex additions and 3 complex multiplications per entry.
      return 26. * n_updates;
    case FourPointType::PARTICLE_HOLE_CHARGE:
      // 3 complex additions and 3 complex multiplications per entry.
      return 26. * n_updates;
    case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP:
      // 3 complex additions and 4 complex multiplications per entry.
      return 32 * n_updates;
    case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN:
      // 2 complex additions and 2 complex multiplications per entry.
      return 18. * n_updates;
    case FourPointType::PARTICLE_PARTICLE_UP_DOWN:
      // 2 complex additions and 2 complex multiplications per entry.
      return 18. * n_updates;
    default:
      throw(std::logic_error("Invalid mode"));
  }
}

  
// Explicit instantiation.
template void computeGSingleband<float>(std::complex<float>* G, int ldg,
                                        const std::complex<float>* G0, int nk, int nw,
@@ -902,6 +1058,19 @@ double updateG4<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN,
    const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor,
    bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);

// Non spin symmetric
template double updateG4NoSpin<std::complex<float>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<float>>(
    std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
    const std::complex<float> factor, bool atomic,
    cudaStream_t stream, std::size_t start, std::size_t end);

template double updateG4NoSpin<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<double>>(
    std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
    const std::complex<double> factor, bool atomic,
    cudaStream_t stream, std::size_t start, std::size_t end);



// template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>(
//   std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
//   const std::complex<float>* G_down, const int ldgd, const std::int8_t factor, bool atomic,
+2 −2
Original line number Diff line number Diff line
@@ -58,8 +58,8 @@ dca_add_gtest(tp_accumulator_gpu_test
  )

dca_add_gtest(tp_accumulator_complex_g0_gpu_test
  FAST
  CUDA
        GTEST_MAIN
        INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR}
        LIBS     ${DCA_LIBS} ${KERNELS_LIB}
        )
+156 −18
Original line number Diff line number Diff line
@@ -40,6 +40,21 @@ using McOptions = MockMcOptions<Scalar>;

constexpr bool update_baseline = false;

constexpr bool write_G4s = true;

#ifdef DCA_HAVE_ADIOS2
adios2::ADIOS* adios_ptr;
#endif

#ifdef DCA_HAVE_MPI
#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"
dca::parallel::MPIConcurrency* concurrency_ptr;
#else
#include "dca/parallel/no_concurrency/no_concurrency.hpp"
dca::parallel::NoConcurrency* concurrency_ptr;
#endif


#define INPUT_DIR \
  DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/"

@@ -49,29 +64,55 @@ using ConfigGenerator = dca::testing::AccumulationTest<std::complex<double>>;
using Configuration = ConfigGenerator::Configuration;
using Sample = ConfigGenerator::Sample;

using TpAccumulatorComplexG0GpuTest =
  dca::testing::G0Setup<Scalar, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_AUX, input_file>;
template <typename SCALAR>
struct TpAccumulatorComplexG0GpuTest : public ::testing::Test {
  using G0Setup = dca::testing::G0SetupBare<SCALAR, dca::testing::LatticeRashba,
                                            dca::ClusterSolverId::CT_AUX, input_file>;
  virtual void SetUp() {
    host_setup.SetUp();
    gpu_setup.SetUp();
  }

  virtual void TearDown() {}
  G0Setup host_setup;
  G0Setup gpu_setup;
};


uint loop_counter = 0;

TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) {
  dca::linalg::util::initializeMagma();
using TestTypes = ::testing::Types<std::complex<double>>;
TYPED_TEST_CASE(TpAccumulatorComplexG0GpuTest, TestTypes);

#define TYPING_PREFACE                                            \
  using Scalar = TypeParam;                                       \
  using ConfigGenerator = dca::testing::AccumulationTest<Scalar>; \
  using Configuration = typename ConfigGenerator::Configuration;  \
  using Sample = typename ConfigGenerator::Sample;

  const std::array<int, 2> n{35, 0};
TYPED_TEST(TpAccumulatorComplexG0GpuTest, Accumulate) {
  TYPING_PREFACE

  const std::array<int, 2> n{18, 22};
  Sample M;
  Configuration config;
  ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest::BDmn::dmn_size(),
                                        TpAccumulatorComplexG0GpuTest::RDmn::dmn_size(),
                                        parameters_.get_beta(), n);
  using FourPointType = dca::phys::FourPointType;
  
  ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::BDmn::dmn_size(),
                                        TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::RDmn::dmn_size(),
                                        this->host_setup.parameters_.get_beta(), n);
  std::vector<FourPointType> four_point_channels{
       FourPointType::PARTICLE_PARTICLE_UP_DOWN};

  using namespace dca::phys;
  parameters_.set_four_point_channels(std::vector<FourPointType>{FourPointType::PARTICLE_PARTICLE_UP_DOWN});
  this->host_setup.parameters_.set_four_point_channels(four_point_channels);
  this->gpu_setup.parameters_.set_four_point_channels(four_point_channels);

  dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::CPU> accumulatorHost(
      data_->G0_k_w_cluster_excluded, parameters_);
  dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice(
      data_->G0_k_w_cluster_excluded, parameters_);
  const int8_t sign = 1;
  dca::phys::solver::accumulator::TpAccumulator<decltype(this->host_setup.parameters_), dca::DistType::NONE, dca::linalg::CPU> accumulatorHost(
      this->host_setup.data_->G0_k_w_cluster_excluded, this->host_setup.parameters_);
  dca::phys::solver::accumulator::TpAccumulator<decltype(this->gpu_setup.parameters_), dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice(
      this->gpu_setup.data_->G0_k_w_cluster_excluded, this->gpu_setup.parameters_);
  const std::complex<double> sign = {1.0, 0.0};

  accumulatorDevice.resetAccumulation(loop_counter);
  accumulatorDevice.accumulate(M, config, sign);
@@ -83,10 +124,107 @@ TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) {

  ++loop_counter;

  for (std::size_t channel = 0; channel < accumulatorHost.num_channels(); ++channel) {
    const auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel],
  #ifdef DCA_HAVE_ADIOS2
  if (write_G4s) {
    dca::io::Writer writer(*adios_ptr, *concurrency_ptr, "ADIOS2", true);
    dca::io::Writer writer_h5(*adios_ptr, *concurrency_ptr, "HDF5", true);

    writer.open_file("tp_gpu_test_complex_G0_G4.bp");
    writer_h5.open_file("tp_gpu_test_complex_G0_G4.hdf5");

    this->host_setup.parameters_.write(writer);
    this->host_setup.parameters_.write(writer_h5);
    this->host_setup.data_->write(writer);
    this->host_setup.data_->write(writer_h5);

    for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) {
      std::string channel_str =
          dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]);
      writer.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]);
      writer.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]);
      writer_h5.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]);
      writer_h5.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]);
    }
    writer_h5.execute("accumulatorDevice_G_0", accumulatorDevice.get_G_Debug()[0]);
    writer_h5.execute("accumulatorDevice_G_1", accumulatorDevice.get_G_Debug()[1]);
    writer_h5.execute("accumulatorHOST_G", accumulatorHost.get_G_Debug());

#ifndef NDEBUG
    const auto& G_up = accumulatorDevice.get_G_Debug()[0];
    const auto& G_down = accumulatorDevice.get_G_Debug()[1];
    using Parameters = decltype(this->host_setup.parameters_);
    using TpComplex = typename decltype(accumulatorDevice)::TpComplex;
    using HostSpinSepG = dca::linalg::ReshapableMatrix<TpComplex, dca::linalg::CPU,
                                                       dca::linalg::util::PinnedAllocator<TpComplex>>;
    std::array<HostSpinSepG, 2> G_spin_separated{G_up.size(), G_down.size()};

    using WTpExtDmn = dca::func::dmn_0<domains::vertex_frequency_domain<domains::EXTENDED>>;
    using KDmn = typename Parameters::KClusterDmn;
    using BDmn = dca::func::dmn_0<domains::electron_band_domain>;
    using SDmn = dca::func::dmn_0<domains::electron_spin_domain>;
    auto& g_all = accumulatorHost.get_G_Debug();

    for (int spin = 0; spin < SDmn::dmn_size(); ++spin) {
      auto& g_this_spin = G_spin_separated[spin];
      auto g_it = g_this_spin.begin();
      for (int w1 = 0; w1 < WTpExtDmn::dmn_size(); ++w1)
        for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1)
          for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1)
            for (int w2 = 0; w2 < WTpExtDmn::dmn_size(); ++w2)
              for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2)
                for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2, ++g_it)
                  *g_it = g_all(b1, b2, spin, k2, k1, w2, w1);
    }

    writer_h5.execute("accumulatorHOST_G_0", G_spin_separated[0]);
    writer_h5.execute("accumulatorHOST_G_1", G_spin_separated[1]);

    for (int i = 0; i < G_up.size().first; ++i)
      for (int j = 0; j < G_up.size().second; ++j) {
        EXPECT_NEAR(G_up(i, j).real(), G_spin_separated[0](i, j).real(), 1E-12) << "( " << i << ", " << j << " )";
        EXPECT_NEAR(G_up(i, j).imag(), G_spin_separated[0](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )";
        EXPECT_NEAR(G_down(i, j).real(), G_spin_separated[1](i, j).real(), 1E-12) << "( " << i << ", " << j << " )";
	EXPECT_NEAR(G_down(i, j).imag(), G_spin_separated[1](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )";
      }
#endif

    writer.close_file();
    writer_h5.close_file();
  }
#endif

  std::cout << "blocks: " << dca::util::ceilDiv(int(accumulatorHost.get_G4()[0].size()), 256) << '\n';

  for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) {
    auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel],
                                            accumulatorDevice.get_G4()[channel]);
    EXPECT_GT(5e-7, diff.l_inf);
    EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]);
  }
}


// Test driver: sets up concurrency (MPI when available), MAGMA, and optionally ADIOS2
// before handing control to GoogleTest. The concurrency and ADIOS objects outlive
// RUN_ALL_TESTS via the file-scope pointers the tests read.
int main(int argc, char** argv) {
#ifdef DCA_HAVE_MPI
  dca::parallel::MPIConcurrency concurrency(argc, argv);
  concurrency_ptr = &concurrency;
#else
  dca::parallel::NoConcurrency concurrency(argc, argv);
  concurrency_ptr = &concurrency;
#endif

  // MAGMA must be initialized before any accumulator touches the GPU.
  dca::linalg::util::initializeMagma();

#ifdef DCA_HAVE_ADIOS2
  // ADIOS expects MPI_COMM pointer or nullptr
  adios2::ADIOS adios("", concurrency_ptr->get(), false);
  adios_ptr = &adios;
#endif
  ::testing::InitGoogleTest(&argc, argv);

  // Kept for debugging: swap in the minimalist printer to quiet per-test output.
  // ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners();
  // delete listeners.Release(listeners.default_result_printer());
  // listeners.Append(new dca::testing::MinimalistPrinter);

  int result = RUN_ALL_TESTS();
  return result;
}
+2 −2

File changed.

Contains only whitespace changes.

Loading