Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp +8 −2 Original line number Diff line number Diff line Loading @@ -41,8 +41,14 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>* // Updates G4 in the range [start, end) template <typename Scalar, FourPointType type, typename SignType> double updateG4(Scalar* G4, const Scalar* G_dn, const int ldgd, const Scalar* G_up, const int ldgu, const SignType factor, double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_dn, const int ldgd, const SignType factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // Updates G4 in the range [start, end) template <typename Scalar, FourPointType type, typename SignType> double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); Loading include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +48 −38 Original line number Diff line number Diff line Loading @@ -195,6 +195,7 @@ protected: using Base::G4_; using Base::multiple_accumulators_; using Base::n_bands_; // using Base::n_pos_frqs_; using Base::non_density_density_; Loading Loading @@ -250,7 +251,9 @@ protected: #endif #ifndef NDEBUG std::array<linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>,2> G_debug_; std::array< linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>, 2> G_debug_; #endif #ifndef DCA_HAVE_GPU_AWARE_MPI Loading Loading @@ -307,7 +310,6 @@ void TpAccumulator<Parameters, DT, linalg::GPU>::resetG4() { G4_channel.setStream(reset_stream); G4_channel.resizeNoCopy(G4_[0].size()); G4_channel.setToZero(reset_stream); } catch (std::bad_alloc& err) { std::cerr << "Failed to allocate G4 on device.\n"; Loading Loading @@ -419,6 +421,7 @@ double TpAccumulator<Parameters, 
DT, linalg::GPU>::updateG4(const std::size_t ch uint64_t start = Base::G4_[0].get_start(); uint64_t end = Base::G4_[0].get_end() + 1; // because the kernel expects this to be one past the end index if constexpr (Base::spin_symmetric_) { switch (channel) { case FourPointType::PARTICLE_HOLE_TRANSVERSE: return details::updateG4<TpComplex, FourPointType::PARTICLE_HOLE_TRANSVERSE>( Loading Loading @@ -454,6 +457,14 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); } } else { if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN ) return details::updateG4NoSpin<TpComplex, FourPointType::PARTICLE_PARTICLE_UP_DOWN>( get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); } throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); } template <class Parameters, DistType DT> void TpAccumulator<Parameters, DT, linalg::GPU>::finalize() { Loading Loading @@ -498,8 +509,7 @@ const std::vector<typename TpAccumulator<Parameters, DT, linalg::GPU>::Base::TpG #ifndef NDEBUG template <class Parameters, DistType DT> const auto& TpAccumulator< Parameters, DT, linalg::GPU>::get_G_Debug() { const auto& TpAccumulator<Parameters, DT, linalg::GPU>::get_G_Debug() { if (G_debug_.empty()) throw std::logic_error("There is no G4 stored in this class."); Loading src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +169 −0 Original line number Diff line number Diff line Loading @@ -745,6 +745,162 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_ } } template <typename Scalar, FourPointType type, typename SignType> __global__ void updateG4KernelNoSpin(CudaComplex<RealAlias<Scalar>>* __restrict__ G4, const CudaComplex<RealAlias<Scalar>>* __restrict__ G_up, const int ldgu, const SignType factor, const 
bool atomic, const uint64_t start, const uint64_t end) { // TODO: reduce code duplication. // TODO: decrease, if possible, register pressure. E.g. a single thread computes all bands. const uint64_t local_g4_index = static_cast<uint64_t>(blockIdx.x) * static_cast<uint64_t>(blockDim.x) + static_cast<uint64_t>(threadIdx.x); const uint64_t g4_index = local_g4_index + start; if (g4_index >= end) { // out of domain. return; } Scalar complex_factor; dca::linalg::assign(complex_factor, factor); const Scalar sign_over_2 = 0.5 * complex_factor; int b1, b2, b3, b4, k1, k2, k_ex, w1, w2, w_ex; g4_helper.unrollIndex(g4_index, b1, b2, b3, b4, k1, w1, k2, w2, k_ex, w_ex); const int nb = g4_helper.get_bands(); const int nk = g4_helper.get_cluster_size(); CudaComplex<RealAlias<Scalar>> contribution; const unsigned no = nk * nb; // This code needs to be repeated over and over. This happens in getGMultiband in the cpu // implementation. The gpu code is structed differently so without signficant restructing this // can't happen in the extendGIndiciesMultiBand routines. auto condSwapAdd = [](int& ia, int& ib, const int ba, const int bb, const bool cond) { if (cond) { ia += bb; ib += ba; } else { ia += ba; ib += bb; } }; // Compute the contribution to G4. In all the products of Green's function of type Ga * Gb, // the dependency on the bands is implied as Ga(b1, b2) * Gb(b2, b3). Sums and differences with // the exchange momentum, implies the same operation is performed with the exchange frequency. // See tp_accumulator.hpp for more details. 
if constexpr (type == FourPointType::PARTICLE_PARTICLE_UP_DOWN) { { int w1_a(w1); int w2_a(w2); int k1_a(k1); int k2_a(k2); g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); int w1_b(g4_helper.wexMinus(w1, w_ex)); int w2_b(g4_helper.wexMinus(w2, w_ex)); int k1_b = g4_helper.kexMinus(k1, k_ex); int k2_b = g4_helper.kexMinus(k2, k_ex); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); int i_a = nb * k1_a + no * w1_a; int j_a = nb * k2_a + no * w2_a; condSwapAdd(i_a, j_a, b1, b3, true); int i_b = nb * k1_b + no * w1_b; int j_b = nb * k2_b + no * w2_b; condSwapAdd(i_b, j_b, b2, b4, true); const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a]; const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b]; contribution = complex_factor * (Ga_1 * Gb_1); } { int w1_a(w1); int w2_a(g4_helper.wexMinus(w2, w_ex)); int k1_a(k1); int k2_a(g4_helper.kexMinus(k2, k_ex)); g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); int w1_b(g4_helper.wexMinus(w1, w_ex)); int w2_b(w2); int k1_b(g4_helper.kexMinus(k1, k_ex)); int k2_b(k2); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); int i_a = nb * k1_a + no * w1_a; int j_a = nb * k2_a + no * w2_a; condSwapAdd(i_a, j_a, b1, b4, true); int i_b = nb * k1_b + no * w1_b; int j_b = nb * k2_b + no * w2_b; condSwapAdd(i_b, j_b, b2, b3, true); const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a]; const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b]; contribution -= complex_factor * (Ga_1 * Gb_1); } } decltype(G4) const result_ptr = G4 + local_g4_index; if (atomic) dca::linalg::atomicAdd(result_ptr, contribution); else *result_ptr += contribution; } template <typename Scalar, FourPointType type, typename SignType> double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end) { constexpr const std::size_t n_threads = 256; const unsigned n_blocks = 
dca::util::ceilDiv(end - start, n_threads); using dca::util::GPUTypeConversion; updateG4KernelNoSpin<dca::util::CUDATypeMap<Scalar>, type><<<n_blocks, n_threads, 0, stream>>>( castGPUType(G4), castGPUType(G_up), ldgu, GPUTypeConversion(factor), atomic, start, end); // Check for errors. auto err = cudaPeekAtLastError(); if (err != cudaSuccess) { linalg::util::printErrorMessage(err, __FUNCTION__, __FILE__, __LINE__); throw(std::runtime_error("CUDA failed to launch the G4 kernel.")); } const std::size_t n_updates = end - start; switch (type) { // Note: sign flips are ignored and a single complex * real multiplication is // present in all modes. case FourPointType::PARTICLE_HOLE_TRANSVERSE: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; case FourPointType::PARTICLE_HOLE_MAGNETIC: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; case FourPointType::PARTICLE_HOLE_CHARGE: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. return 32 * n_updates; case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; default: throw(std::logic_error("Invalid mode")); } } // Explicit instantiation. 
template void computeGSingleband<float>(std::complex<float>* G, int ldg, const std::complex<float>* G0, int nk, int nw, Loading Loading @@ -902,6 +1058,19 @@ double updateG4<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // Non spin symmetric template double updateG4NoSpin<std::complex<float>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<float>>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float> factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); template double updateG4NoSpin<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<double>>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double> factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>( // std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, // const std::complex<float>* G_down, const int ldgd, const std::int8_t factor, bool atomic, Loading test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt +2 −2 Original line number Diff line number Diff line Loading @@ -58,8 +58,8 @@ dca_add_gtest(tp_accumulator_gpu_test ) dca_add_gtest(tp_accumulator_complex_g0_gpu_test FAST CUDA GTEST_MAIN INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} LIBS ${DCA_LIBS} ${KERNELS_LIB} ) Loading test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_complex_g0_gpu_test.cpp +156 −18 Original line number Diff line number Diff line Loading @@ -40,6 +40,21 @@ using McOptions = MockMcOptions<Scalar>; constexpr bool update_baseline = false; constexpr bool write_G4s = true; #ifdef DCA_HAVE_ADIOS2 adios2::ADIOS* adios_ptr; #endif 
#ifdef DCA_HAVE_MPI #include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp" dca::parallel::MPIConcurrency* concurrency_ptr; #else #include "dca/parallel/no_concurrency/no_concurrency.hpp" dca::parallel::NoConcurrency* concurrency_ptr; #endif #define INPUT_DIR \ DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/" Loading @@ -49,29 +64,55 @@ using ConfigGenerator = dca::testing::AccumulationTest<std::complex<double>>; using Configuration = ConfigGenerator::Configuration; using Sample = ConfigGenerator::Sample; using TpAccumulatorComplexG0GpuTest = dca::testing::G0Setup<Scalar, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_AUX, input_file>; template <typename SCALAR> struct TpAccumulatorComplexG0GpuTest : public ::testing::Test { using G0Setup = dca::testing::G0SetupBare<SCALAR, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_AUX, input_file>; virtual void SetUp() { host_setup.SetUp(); gpu_setup.SetUp(); } virtual void TearDown() {} G0Setup host_setup; G0Setup gpu_setup; }; uint loop_counter = 0; TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) { dca::linalg::util::initializeMagma(); using TestTypes = ::testing::Types<std::complex<double>>; TYPED_TEST_CASE(TpAccumulatorComplexG0GpuTest, TestTypes); #define TYPING_PREFACE \ using Scalar = TypeParam; \ using ConfigGenerator = dca::testing::AccumulationTest<Scalar>; \ using Configuration = typename ConfigGenerator::Configuration; \ using Sample = typename ConfigGenerator::Sample; const std::array<int, 2> n{35, 0}; TYPED_TEST(TpAccumulatorComplexG0GpuTest, Accumulate) { TYPING_PREFACE const std::array<int, 2> n{18, 22}; Sample M; Configuration config; ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest::BDmn::dmn_size(), TpAccumulatorComplexG0GpuTest::RDmn::dmn_size(), parameters_.get_beta(), n); using FourPointType = dca::phys::FourPointType; ConfigGenerator::prepareConfiguration(config, M, 
TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::BDmn::dmn_size(), TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::RDmn::dmn_size(), this->host_setup.parameters_.get_beta(), n); std::vector<FourPointType> four_point_channels{ FourPointType::PARTICLE_PARTICLE_UP_DOWN}; using namespace dca::phys; parameters_.set_four_point_channels(std::vector<FourPointType>{FourPointType::PARTICLE_PARTICLE_UP_DOWN}); this->host_setup.parameters_.set_four_point_channels(four_point_channels); this->gpu_setup.parameters_.set_four_point_channels(four_point_channels); dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::CPU> accumulatorHost( data_->G0_k_w_cluster_excluded, parameters_); dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice( data_->G0_k_w_cluster_excluded, parameters_); const int8_t sign = 1; dca::phys::solver::accumulator::TpAccumulator<decltype(this->host_setup.parameters_), dca::DistType::NONE, dca::linalg::CPU> accumulatorHost( this->host_setup.data_->G0_k_w_cluster_excluded, this->host_setup.parameters_); dca::phys::solver::accumulator::TpAccumulator<decltype(this->gpu_setup.parameters_), dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice( this->gpu_setup.data_->G0_k_w_cluster_excluded, this->gpu_setup.parameters_); const std::complex<double> sign = {1.0, 0.0}; accumulatorDevice.resetAccumulation(loop_counter); accumulatorDevice.accumulate(M, config, sign); Loading @@ -83,10 +124,107 @@ TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) { ++loop_counter; for (std::size_t channel = 0; channel < accumulatorHost.num_channels(); ++channel) { const auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel], #ifdef DCA_HAVE_ADIOS2 if (write_G4s) { dca::io::Writer writer(*adios_ptr, *concurrency_ptr, "ADIOS2", true); dca::io::Writer writer_h5(*adios_ptr, *concurrency_ptr, "HDF5", true); writer.open_file("tp_gpu_test_complex_G0_G4.bp"); 
writer_h5.open_file("tp_gpu_test_complex_G0_G4.hdf5"); this->host_setup.parameters_.write(writer); this->host_setup.parameters_.write(writer_h5); this->host_setup.data_->write(writer); this->host_setup.data_->write(writer_h5); for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { std::string channel_str = dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]); writer.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); writer.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]); writer_h5.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); writer_h5.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]); } writer_h5.execute("accumulatorDevice_G_0", accumulatorDevice.get_G_Debug()[0]); writer_h5.execute("accumulatorDevice_G_1", accumulatorDevice.get_G_Debug()[1]); writer_h5.execute("accumulatorHOST_G", accumulatorHost.get_G_Debug()); #ifndef NDEBUG const auto& G_up = accumulatorDevice.get_G_Debug()[0]; const auto& G_down = accumulatorDevice.get_G_Debug()[1]; using Parameters = decltype(this->host_setup.parameters_); using TpComplex = typename decltype(accumulatorDevice)::TpComplex; using HostSpinSepG = dca::linalg::ReshapableMatrix<TpComplex, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>; std::array<HostSpinSepG, 2> G_spin_separated{G_up.size(), G_down.size()}; using WTpExtDmn = dca::func::dmn_0<domains::vertex_frequency_domain<domains::EXTENDED>>; using KDmn = typename Parameters::KClusterDmn; using BDmn = dca::func::dmn_0<domains::electron_band_domain>; using SDmn = dca::func::dmn_0<domains::electron_spin_domain>; auto& g_all = accumulatorHost.get_G_Debug(); for (int spin = 0; spin < SDmn::dmn_size(); ++spin) { auto& g_this_spin = G_spin_separated[spin]; auto g_it = g_this_spin.begin(); for (int w1 = 0; w1 < WTpExtDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) 
for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) for (int w2 = 0; w2 < WTpExtDmn::dmn_size(); ++w2) for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2, ++g_it) *g_it = g_all(b1, b2, spin, k2, k1, w2, w1); } writer_h5.execute("accumulatorHOST_G_0", G_spin_separated[0]); writer_h5.execute("accumulatorHOST_G_1", G_spin_separated[1]); for (int i = 0; i < G_up.size().first; ++i) for (int j = 0; j < G_up.size().second; ++j) { EXPECT_NEAR(G_up(i, j).real(), G_spin_separated[0](i, j).real(), 1E-12) << "( " << i << ", " << j << " )"; EXPECT_NEAR(G_up(i, j).imag(), G_spin_separated[0](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )"; EXPECT_NEAR(G_down(i, j).real(), G_spin_separated[1](i, j).real(), 1E-12) << "( " << i << ", " << j << " )"; EXPECT_NEAR(G_down(i, j).imag(), G_spin_separated[1](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )"; } #endif writer.close_file(); writer_h5.close_file(); } #endif std::cout << "blocks: " << dca::util::ceilDiv(int(accumulatorHost.get_G4()[0].size()), 256) << '\n'; for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel], accumulatorDevice.get_G4()[channel]); EXPECT_GT(5e-7, diff.l_inf); EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]); } } int main(int argc, char** argv) { #ifdef DCA_HAVE_MPI dca::parallel::MPIConcurrency concurrency(argc, argv); concurrency_ptr = &concurrency; #else dca::parallel::NoConcurrency concurrency(argc, argv); concurrency_ptr = &concurrency; #endif dca::linalg::util::initializeMagma(); #ifdef DCA_HAVE_ADIOS2 // ADIOS expects MPI_COMM pointer or nullptr adios2::ADIOS adios("", concurrency_ptr->get(), false); adios_ptr = &adios; #endif ::testing::InitGoogleTest(&argc, argv); // ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); // delete 
listeners.Release(listeners.default_result_printer()); // listeners.Append(new dca::testing::MinimalistPrinter); int result = RUN_ALL_TESTS(); return result; } include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp +2 −2 File changed.Contains only whitespace changes. Show changes Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp +8 −2 Original line number Diff line number Diff line Loading @@ -41,8 +41,14 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>* // Updates G4 in the range [start, end) template <typename Scalar, FourPointType type, typename SignType> double updateG4(Scalar* G4, const Scalar* G_dn, const int ldgd, const Scalar* G_up, const int ldgu, const SignType factor, double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_dn, const int ldgd, const SignType factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // Updates G4 in the range [start, end) template <typename Scalar, FourPointType type, typename SignType> double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); Loading
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +48 −38 Original line number Diff line number Diff line Loading @@ -195,6 +195,7 @@ protected: using Base::G4_; using Base::multiple_accumulators_; using Base::n_bands_; // using Base::n_pos_frqs_; using Base::non_density_density_; Loading Loading @@ -250,7 +251,9 @@ protected: #endif #ifndef NDEBUG std::array<linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>,2> G_debug_; std::array< linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>, 2> G_debug_; #endif #ifndef DCA_HAVE_GPU_AWARE_MPI Loading Loading @@ -307,7 +310,6 @@ void TpAccumulator<Parameters, DT, linalg::GPU>::resetG4() { G4_channel.setStream(reset_stream); G4_channel.resizeNoCopy(G4_[0].size()); G4_channel.setToZero(reset_stream); } catch (std::bad_alloc& err) { std::cerr << "Failed to allocate G4 on device.\n"; Loading Loading @@ -419,6 +421,7 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch uint64_t start = Base::G4_[0].get_start(); uint64_t end = Base::G4_[0].get_end() + 1; // because the kernel expects this to be one past the end index if constexpr (Base::spin_symmetric_) { switch (channel) { case FourPointType::PARTICLE_HOLE_TRANSVERSE: return details::updateG4<TpComplex, FourPointType::PARTICLE_HOLE_TRANSVERSE>( Loading Loading @@ -454,6 +457,14 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); } } else { if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN ) return details::updateG4NoSpin<TpComplex, FourPointType::PARTICLE_PARTICLE_UP_DOWN>( get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); } throw std::logic_error("Specified four point type not implemented by 
tp_accumulator_gpu."); } template <class Parameters, DistType DT> void TpAccumulator<Parameters, DT, linalg::GPU>::finalize() { Loading Loading @@ -498,8 +509,7 @@ const std::vector<typename TpAccumulator<Parameters, DT, linalg::GPU>::Base::TpG #ifndef NDEBUG template <class Parameters, DistType DT> const auto& TpAccumulator< Parameters, DT, linalg::GPU>::get_G_Debug() { const auto& TpAccumulator<Parameters, DT, linalg::GPU>::get_G_Debug() { if (G_debug_.empty()) throw std::logic_error("There is no G4 stored in this class."); Loading
src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +169 −0 Original line number Diff line number Diff line Loading @@ -745,6 +745,162 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_ } } template <typename Scalar, FourPointType type, typename SignType> __global__ void updateG4KernelNoSpin(CudaComplex<RealAlias<Scalar>>* __restrict__ G4, const CudaComplex<RealAlias<Scalar>>* __restrict__ G_up, const int ldgu, const SignType factor, const bool atomic, const uint64_t start, const uint64_t end) { // TODO: reduce code duplication. // TODO: decrease, if possible, register pressure. E.g. a single thread computes all bands. const uint64_t local_g4_index = static_cast<uint64_t>(blockIdx.x) * static_cast<uint64_t>(blockDim.x) + static_cast<uint64_t>(threadIdx.x); const uint64_t g4_index = local_g4_index + start; if (g4_index >= end) { // out of domain. return; } Scalar complex_factor; dca::linalg::assign(complex_factor, factor); const Scalar sign_over_2 = 0.5 * complex_factor; int b1, b2, b3, b4, k1, k2, k_ex, w1, w2, w_ex; g4_helper.unrollIndex(g4_index, b1, b2, b3, b4, k1, w1, k2, w2, k_ex, w_ex); const int nb = g4_helper.get_bands(); const int nk = g4_helper.get_cluster_size(); CudaComplex<RealAlias<Scalar>> contribution; const unsigned no = nk * nb; // This code needs to be repeated over and over. This happens in getGMultiband in the cpu // implementation. The gpu code is structed differently so without signficant restructing this // can't happen in the extendGIndiciesMultiBand routines. auto condSwapAdd = [](int& ia, int& ib, const int ba, const int bb, const bool cond) { if (cond) { ia += bb; ib += ba; } else { ia += ba; ib += bb; } }; // Compute the contribution to G4. In all the products of Green's function of type Ga * Gb, // the dependency on the bands is implied as Ga(b1, b2) * Gb(b2, b3). 
Sums and differences with // the exchange momentum, implies the same operation is performed with the exchange frequency. // See tp_accumulator.hpp for more details. if constexpr (type == FourPointType::PARTICLE_PARTICLE_UP_DOWN) { { int w1_a(w1); int w2_a(w2); int k1_a(k1); int k2_a(k2); g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); int w1_b(g4_helper.wexMinus(w1, w_ex)); int w2_b(g4_helper.wexMinus(w2, w_ex)); int k1_b = g4_helper.kexMinus(k1, k_ex); int k2_b = g4_helper.kexMinus(k2, k_ex); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); int i_a = nb * k1_a + no * w1_a; int j_a = nb * k2_a + no * w2_a; condSwapAdd(i_a, j_a, b1, b3, true); int i_b = nb * k1_b + no * w1_b; int j_b = nb * k2_b + no * w2_b; condSwapAdd(i_b, j_b, b2, b4, true); const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a]; const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b]; contribution = complex_factor * (Ga_1 * Gb_1); } { int w1_a(w1); int w2_a(g4_helper.wexMinus(w2, w_ex)); int k1_a(k1); int k2_a(g4_helper.kexMinus(k2, k_ex)); g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); int w1_b(g4_helper.wexMinus(w1, w_ex)); int w2_b(w2); int k1_b(g4_helper.kexMinus(k1, k_ex)); int k2_b(k2); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); int i_a = nb * k1_a + no * w1_a; int j_a = nb * k2_a + no * w2_a; condSwapAdd(i_a, j_a, b1, b4, true); int i_b = nb * k1_b + no * w1_b; int j_b = nb * k2_b + no * w2_b; condSwapAdd(i_b, j_b, b2, b3, true); const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a]; const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b]; contribution -= complex_factor * (Ga_1 * Gb_1); } } decltype(G4) const result_ptr = G4 + local_g4_index; if (atomic) dca::linalg::atomicAdd(result_ptr, contribution); else *result_ptr += contribution; } template <typename Scalar, FourPointType type, typename SignType> double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor, 
bool atomic, cudaStream_t stream, std::size_t start, std::size_t end) { constexpr const std::size_t n_threads = 256; const unsigned n_blocks = dca::util::ceilDiv(end - start, n_threads); using dca::util::GPUTypeConversion; updateG4KernelNoSpin<dca::util::CUDATypeMap<Scalar>, type><<<n_blocks, n_threads, 0, stream>>>( castGPUType(G4), castGPUType(G_up), ldgu, GPUTypeConversion(factor), atomic, start, end); // Check for errors. auto err = cudaPeekAtLastError(); if (err != cudaSuccess) { linalg::util::printErrorMessage(err, __FUNCTION__, __FILE__, __LINE__); throw(std::runtime_error("CUDA failed to launch the G4 kernel.")); } const std::size_t n_updates = end - start; switch (type) { // Note: sign flips are ignored and a single complex * real multiplication is // present in all modes. case FourPointType::PARTICLE_HOLE_TRANSVERSE: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; case FourPointType::PARTICLE_HOLE_MAGNETIC: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; case FourPointType::PARTICLE_HOLE_CHARGE: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. return 32 * n_updates; case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; default: throw(std::logic_error("Invalid mode")); } } // Explicit instantiation. 
// Of updateG4NoSpin, only FourPointType::PARTICLE_PARTICLE_UP_DOWN is instantiated in the text
// below, once for std::complex<float> and once for std::complex<double> (each used as both the
// Scalar and the SignType template argument).
// NOTE(review): presumably any other channel or sign type requested by a caller needs an
// additional explicit instantiation here -- confirm against the declaration in
// kernels_interface.hpp.
template void computeGSingleband<float>(std::complex<float>* G, int ldg, const std::complex<float>* G0, int nk, int nw, Loading Loading @@ -902,6 +1058,19 @@ double updateG4<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // Non spin symmetric template double updateG4NoSpin<std::complex<float>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<float>>( std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, const std::complex<float> factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); template double updateG4NoSpin<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<double>>( std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu, const std::complex<double> factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>( // std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu, // const std::complex<float>* G_down, const int ldgd, const std::int8_t factor, bool atomic, Loading
test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt +2 −2 Original line number Diff line number Diff line Loading @@ -58,8 +58,8 @@ dca_add_gtest(tp_accumulator_gpu_test ) dca_add_gtest(tp_accumulator_complex_g0_gpu_test FAST CUDA GTEST_MAIN INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} LIBS ${DCA_LIBS} ${KERNELS_LIB} ) Loading
test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_complex_g0_gpu_test.cpp +156 −18 Original line number Diff line number Diff line Loading @@ -40,6 +40,21 @@ using McOptions = MockMcOptions<Scalar>; constexpr bool update_baseline = false; constexpr bool write_G4s = true; #ifdef DCA_HAVE_ADIOS2 adios2::ADIOS* adios_ptr; #endif #ifdef DCA_HAVE_MPI #include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp" dca::parallel::MPIConcurrency* concurrency_ptr; #else #include "dca/parallel/no_concurrency/no_concurrency.hpp" dca::parallel::NoConcurrency* concurrency_ptr; #endif #define INPUT_DIR \ DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/" Loading @@ -49,29 +64,55 @@ using ConfigGenerator = dca::testing::AccumulationTest<std::complex<double>>; using Configuration = ConfigGenerator::Configuration; using Sample = ConfigGenerator::Sample; using TpAccumulatorComplexG0GpuTest = dca::testing::G0Setup<Scalar, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_AUX, input_file>; template <typename SCALAR> struct TpAccumulatorComplexG0GpuTest : public ::testing::Test { using G0Setup = dca::testing::G0SetupBare<SCALAR, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_AUX, input_file>; virtual void SetUp() { host_setup.SetUp(); gpu_setup.SetUp(); } virtual void TearDown() {} G0Setup host_setup; G0Setup gpu_setup; }; uint loop_counter = 0; TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) { dca::linalg::util::initializeMagma(); using TestTypes = ::testing::Types<std::complex<double>>; TYPED_TEST_CASE(TpAccumulatorComplexG0GpuTest, TestTypes); #define TYPING_PREFACE \ using Scalar = TypeParam; \ using ConfigGenerator = dca::testing::AccumulationTest<Scalar>; \ using Configuration = typename ConfigGenerator::Configuration; \ using Sample = typename ConfigGenerator::Sample; const std::array<int, 2> n{35, 0}; TYPED_TEST(TpAccumulatorComplexG0GpuTest, Accumulate) { TYPING_PREFACE const std::array<int, 2> n{18, 
22}; Sample M; Configuration config; ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest::BDmn::dmn_size(), TpAccumulatorComplexG0GpuTest::RDmn::dmn_size(), parameters_.get_beta(), n); using FourPointType = dca::phys::FourPointType; ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::BDmn::dmn_size(), TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::RDmn::dmn_size(), this->host_setup.parameters_.get_beta(), n); std::vector<FourPointType> four_point_channels{ FourPointType::PARTICLE_PARTICLE_UP_DOWN}; using namespace dca::phys; parameters_.set_four_point_channels(std::vector<FourPointType>{FourPointType::PARTICLE_PARTICLE_UP_DOWN}); this->host_setup.parameters_.set_four_point_channels(four_point_channels); this->gpu_setup.parameters_.set_four_point_channels(four_point_channels); dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::CPU> accumulatorHost( data_->G0_k_w_cluster_excluded, parameters_); dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice( data_->G0_k_w_cluster_excluded, parameters_); const int8_t sign = 1; dca::phys::solver::accumulator::TpAccumulator<decltype(this->host_setup.parameters_), dca::DistType::NONE, dca::linalg::CPU> accumulatorHost( this->host_setup.data_->G0_k_w_cluster_excluded, this->host_setup.parameters_); dca::phys::solver::accumulator::TpAccumulator<decltype(this->gpu_setup.parameters_), dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice( this->gpu_setup.data_->G0_k_w_cluster_excluded, this->gpu_setup.parameters_); const std::complex<double> sign = {1.0, 0.0}; accumulatorDevice.resetAccumulation(loop_counter); accumulatorDevice.accumulate(M, config, sign); Loading @@ -83,10 +124,107 @@ TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) { ++loop_counter; for (std::size_t channel = 0; channel < accumulatorHost.num_channels(); ++channel) { const auto diff = 
dca::func::util::difference(accumulatorHost.get_G4()[channel], #ifdef DCA_HAVE_ADIOS2 if (write_G4s) { dca::io::Writer writer(*adios_ptr, *concurrency_ptr, "ADIOS2", true); dca::io::Writer writer_h5(*adios_ptr, *concurrency_ptr, "HDF5", true); writer.open_file("tp_gpu_test_complex_G0_G4.bp"); writer_h5.open_file("tp_gpu_test_complex_G0_G4.hdf5"); this->host_setup.parameters_.write(writer); this->host_setup.parameters_.write(writer_h5); this->host_setup.data_->write(writer); this->host_setup.data_->write(writer_h5); for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { std::string channel_str = dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]); writer.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); writer.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]); writer_h5.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); writer_h5.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]); } writer_h5.execute("accumulatorDevice_G_0", accumulatorDevice.get_G_Debug()[0]); writer_h5.execute("accumulatorDevice_G_1", accumulatorDevice.get_G_Debug()[1]); writer_h5.execute("accumulatorHOST_G", accumulatorHost.get_G_Debug()); #ifndef NDEBUG const auto& G_up = accumulatorDevice.get_G_Debug()[0]; const auto& G_down = accumulatorDevice.get_G_Debug()[1]; using Parameters = decltype(this->host_setup.parameters_); using TpComplex = typename decltype(accumulatorDevice)::TpComplex; using HostSpinSepG = dca::linalg::ReshapableMatrix<TpComplex, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>; std::array<HostSpinSepG, 2> G_spin_separated{G_up.size(), G_down.size()}; using WTpExtDmn = dca::func::dmn_0<domains::vertex_frequency_domain<domains::EXTENDED>>; using KDmn = typename Parameters::KClusterDmn; using BDmn = dca::func::dmn_0<domains::electron_band_domain>; using SDmn = 
dca::func::dmn_0<domains::electron_spin_domain>; auto& g_all = accumulatorHost.get_G_Debug(); for (int spin = 0; spin < SDmn::dmn_size(); ++spin) { auto& g_this_spin = G_spin_separated[spin]; auto g_it = g_this_spin.begin(); for (int w1 = 0; w1 < WTpExtDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) for (int w2 = 0; w2 < WTpExtDmn::dmn_size(); ++w2) for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2, ++g_it) *g_it = g_all(b1, b2, spin, k2, k1, w2, w1); } writer_h5.execute("accumulatorHOST_G_0", G_spin_separated[0]); writer_h5.execute("accumulatorHOST_G_1", G_spin_separated[1]); for (int i = 0; i < G_up.size().first; ++i) for (int j = 0; j < G_up.size().second; ++j) { EXPECT_NEAR(G_up(i, j).real(), G_spin_separated[0](i, j).real(), 1E-12) << "( " << i << ", " << j << " )"; EXPECT_NEAR(G_up(i, j).imag(), G_spin_separated[0](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )"; EXPECT_NEAR(G_down(i, j).real(), G_spin_separated[1](i, j).real(), 1E-12) << "( " << i << ", " << j << " )"; EXPECT_NEAR(G_down(i, j).imag(), G_spin_separated[1](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )"; } #endif writer.close_file(); writer_h5.close_file(); } #endif std::cout << "blocks: " << dca::util::ceilDiv(int(accumulatorHost.get_G4()[0].size()), 256) << '\n'; for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel], accumulatorDevice.get_G4()[channel]); EXPECT_GT(5e-7, diff.l_inf); EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]); } } int main(int argc, char** argv) { #ifdef DCA_HAVE_MPI dca::parallel::MPIConcurrency concurrency(argc, argv); concurrency_ptr = &concurrency; #else dca::parallel::NoConcurrency concurrency(argc, argv); concurrency_ptr = &concurrency; #endif dca::linalg::util::initializeMagma(); #ifdef 
DCA_HAVE_ADIOS2 // ADIOS expects MPI_COMM pointer or nullptr adios2::ADIOS adios("", concurrency_ptr->get(), false); adios_ptr = &adios; #endif ::testing::InitGoogleTest(&argc, argv); // ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); // delete listeners.Release(listeners.default_result_printer()); // listeners.Append(new dca::testing::MinimalistPrinter); int result = RUN_ALL_TESTS(); return result; }
include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp +2 −2 File changed. Contains only whitespace changes. Show changes