working (for Rashba) complex g0 G4 GPU (ddf28be3) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/kernels_interface.hpp

+8 −2

Original line number	Diff line number	Diff line
		@@ -41,8 +41,14 @@ void computeGMultiband(std::complex<Real>* G, int ldg, const std::complex<Real>*

		// Updates G4 in the range [start, end)
		template <typename Scalar, FourPointType type, typename SignType>
		double updateG4(Scalar* G4, const Scalar* G_dn, const int ldgd,
		const Scalar* G_up, const int ldgu, const SignType factor,
		double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu,
		const Scalar* G_dn, const int ldgd, const SignType factor,
		bool atomic, cudaStream_t stream, std::size_t start,
		std::size_t end);

		// Updates G4 in the range [start, end)
		template <typename Scalar, FourPointType type, typename SignType>
		double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor,
		bool atomic, cudaStream_t stream, std::size_t start,
		std::size_t end);

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp

+48 −38

Original line number	Diff line number	Diff line
		@@ -195,6 +195,7 @@ protected:
		using Base::G4_;
		using Base::multiple_accumulators_;
		using Base::n_bands_;

		// using Base::n_pos_frqs_;

		using Base::non_density_density_;
		@@ -250,7 +251,9 @@ protected:
		#endif

		#ifndef NDEBUG
		std::array<linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>,2> G_debug_;
		std::array<
		linalg::ReshapableMatrix<TpComplex, linalg::CPU, dca::linalg::util::PinnedAllocator<TpComplex>>, 2>
		G_debug_;
		#endif

		#ifndef DCA_HAVE_GPU_AWARE_MPI
		@@ -307,7 +310,6 @@ void TpAccumulator<Parameters, DT, linalg::GPU>::resetG4() {
		G4_channel.setStream(reset_stream);
		G4_channel.resizeNoCopy(G4_[0].size());
		G4_channel.setToZero(reset_stream);

		}
		catch (std::bad_alloc& err) {
		std::cerr << "Failed to allocate G4 on device.\n";
		@@ -419,6 +421,7 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch
		uint64_t start = Base::G4_[0].get_start();
		uint64_t end =
		Base::G4_[0].get_end() + 1; // because the kernel expects this to be one past the end index
		if constexpr (Base::spin_symmetric_) {
		switch (channel) {
		case FourPointType::PARTICLE_HOLE_TRANSVERSE:
		return details::updateG4<TpComplex, FourPointType::PARTICLE_HOLE_TRANSVERSE>(
		@@ -454,6 +457,14 @@ double TpAccumulator<Parameters, DT, linalg::GPU>::updateG4(const std::size_t ch
		throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu.");
		}
		}
		else {
		if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN )
		return details::updateG4NoSpin<TpComplex, FourPointType::PARTICLE_PARTICLE_UP_DOWN>(
		get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), factor,
		multiple_accumulators_, queues_[0], start, end);
		}
		throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu.");
		}

		template <class Parameters, DistType DT>
		void TpAccumulator<Parameters, DT, linalg::GPU>::finalize() {
		@@ -498,8 +509,7 @@ const std::vector<typename TpAccumulator<Parameters, DT, linalg::GPU>::Base::TpG

		#ifndef NDEBUG
		template <class Parameters, DistType DT>
		const auto& TpAccumulator<
		Parameters, DT, linalg::GPU>::get_G_Debug() {
		const auto& TpAccumulator<Parameters, DT, linalg::GPU>::get_G_Debug() {
		if (G_debug_.empty())
		throw std::logic_error("There is no G4 stored in this class.");

src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu

+169 −0

Original line number	Diff line number	Diff line
		@@ -745,6 +745,162 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_
		}
		}

		template <typename Scalar, FourPointType type, typename SignType>
		__global__ void updateG4KernelNoSpin(CudaComplex<RealAlias<Scalar>>* __restrict__ G4,
		const CudaComplex<RealAlias<Scalar>>* __restrict__ G_up,
		const int ldgu,
		const SignType factor, const bool atomic,
		const uint64_t start, const uint64_t end) {
		// TODO: reduce code duplication.
		// TODO: decrease, if possible, register pressure. E.g. a single thread computes all bands.

		const uint64_t local_g4_index =
		static_cast<uint64_t>(blockIdx.x) * static_cast<uint64_t>(blockDim.x) +
		static_cast<uint64_t>(threadIdx.x);

		const uint64_t g4_index = local_g4_index + start;

		if (g4_index >= end) { // out of domain.
		return;
		}

		Scalar complex_factor;
		dca::linalg::assign(complex_factor, factor);
		const Scalar sign_over_2 = 0.5 * complex_factor;

		int b1, b2, b3, b4, k1, k2, k_ex, w1, w2, w_ex;
		g4_helper.unrollIndex(g4_index, b1, b2, b3, b4, k1, w1, k2, w2, k_ex, w_ex);

		const int nb = g4_helper.get_bands();
		const int nk = g4_helper.get_cluster_size();

		CudaComplex<RealAlias<Scalar>> contribution;
		const unsigned no = nk * nb;

		// This code needs to be repeated over and over. This happens in getGMultiband in the cpu
		// implementation. The gpu code is structed differently so without signficant restructing this
		// can't happen in the extendGIndiciesMultiBand routines.
		auto condSwapAdd = [](int& ia, int& ib, const int ba, const int bb, const bool cond) {
		if (cond) {
		ia += bb;
		ib += ba;
		}
		else {
		ia += ba;
		ib += bb;
		}
		};
		// Compute the contribution to G4. In all the products of Green's function of type Ga * Gb,
		// the dependency on the bands is implied as Ga(b1, b2) * Gb(b2, b3). Sums and differences with
		// the exchange momentum, implies the same operation is performed with the exchange frequency.
		// See tp_accumulator.hpp for more details.
		if constexpr (type == FourPointType::PARTICLE_PARTICLE_UP_DOWN) {
		{
		int w1_a(w1);
		int w2_a(w2);
		int k1_a(k1);
		int k2_a(k2);
		g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a);

		int w1_b(g4_helper.wexMinus(w1, w_ex));
		int w2_b(g4_helper.wexMinus(w2, w_ex));
		int k1_b = g4_helper.kexMinus(k1, k_ex);
		int k2_b = g4_helper.kexMinus(k2, k_ex);
		g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b);

		int i_a = nb * k1_a + no * w1_a;
		int j_a = nb * k2_a + no * w2_a;
		condSwapAdd(i_a, j_a, b1, b3, true);

		int i_b = nb * k1_b + no * w1_b;
		int j_b = nb * k2_b + no * w2_b;
		condSwapAdd(i_b, j_b, b2, b4, true);

		const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a];
		const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b];

		contribution = complex_factor * (Ga_1 * Gb_1);
		}
		{
		int w1_a(w1);
		int w2_a(g4_helper.wexMinus(w2, w_ex));
		int k1_a(k1);
		int k2_a(g4_helper.kexMinus(k2, k_ex));
		g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a);

		int w1_b(g4_helper.wexMinus(w1, w_ex));
		int w2_b(w2);
		int k1_b(g4_helper.kexMinus(k1, k_ex));
		int k2_b(k2);
		g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b);

		int i_a = nb * k1_a + no * w1_a;
		int j_a = nb * k2_a + no * w2_a;
		condSwapAdd(i_a, j_a, b1, b4, true);

		int i_b = nb * k1_b + no * w1_b;
		int j_b = nb * k2_b + no * w2_b;
		condSwapAdd(i_b, j_b, b2, b3, true);

		const CudaComplex<RealAlias<Scalar>> Ga_1 = G_up[i_a + ldgu * j_a];
		const CudaComplex<RealAlias<Scalar>> Gb_1 = G_up[i_b + ldgu * j_b];

		contribution -= complex_factor * (Ga_1 * Gb_1);
		}
		}
		decltype(G4) const result_ptr = G4 + local_g4_index;
		if (atomic)
		dca::linalg::atomicAdd(result_ptr, contribution);
		else
		*result_ptr += contribution;
		}

		template <typename Scalar, FourPointType type, typename SignType>
		double updateG4NoSpin(Scalar* G4, const Scalar* G_up, const int ldgu, const SignType factor, bool atomic, cudaStream_t stream,
		std::size_t start, std::size_t end) {
		constexpr const std::size_t n_threads = 256;
		const unsigned n_blocks = dca::util::ceilDiv(end - start, n_threads);

		using dca::util::GPUTypeConversion;
		updateG4KernelNoSpin<dca::util::CUDATypeMap<Scalar>, type><<<n_blocks, n_threads, 0, stream>>>(
		castGPUType(G4), castGPUType(G_up), ldgu,
		GPUTypeConversion(factor), atomic, start, end);

		// Check for errors.
		auto err = cudaPeekAtLastError();
		if (err != cudaSuccess) {
		linalg::util::printErrorMessage(err, __FUNCTION__, __FILE__, __LINE__);
		throw(std::runtime_error("CUDA failed to launch the G4 kernel."));
		}

		const std::size_t n_updates = end - start;
		switch (type) {
		// Note: sign flips are ignored and a single complex * real multiplication is
		// present in all modes.
		case FourPointType::PARTICLE_HOLE_TRANSVERSE:
		// Each update of a G4 entry involves 2 complex additions and 2 complex multiplications.
		return 18. * n_updates;
		case FourPointType::PARTICLE_HOLE_MAGNETIC:
		// Each update of a G4 entry involves 3 complex additions and 3 complex multiplications.
		return 26. * n_updates;
		case FourPointType::PARTICLE_HOLE_CHARGE:
		// Each update of a G4 entry involves 3 complex additions and 3 complex multiplications.
		return 26. * n_updates;
		case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP:
		// Each update of a G4 entry involves 3 complex additions and 4 complex multiplications.
		return 32 * n_updates;
		case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN:
		// Each update of a G4 entry involves 2 complex additions and 2 complex multiplications.
		return 18. * n_updates;
		case FourPointType::PARTICLE_PARTICLE_UP_DOWN:
		// Each update of a G4 entry involves 2 complex additions and 2 complex multiplications.
		return 18. * n_updates;
		default:
		throw(std::logic_error("Invalid mode"));
		}
		}


		// Explicit instantiation.
		template void computeGSingleband<float>(std::complex<float>* G, int ldg,
		const std::complex<float>* G0, int nk, int nw,
		@@ -902,6 +1058,19 @@ double updateG4<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN,
		const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor,
		bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);

		// Non spin symmetric
		template double updateG4NoSpin<std::complex<float>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<float>>(
		std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
		const std::complex<float> factor, bool atomic,
		cudaStream_t stream, std::size_t start, std::size_t end);

		template double updateG4NoSpin<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<double>>(
		std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
		const std::complex<double> factor, bool atomic,
		cudaStream_t stream, std::size_t start, std::size_t end);



		// template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>(
		// std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
		// const std::complex<float>* G_down, const int ldgd, const std::int8_t factor, bool atomic,

test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt

+2 −2

Original line number	Diff line number	Diff line
		@@ -58,8 +58,8 @@ dca_add_gtest(tp_accumulator_gpu_test
		)

		dca_add_gtest(tp_accumulator_complex_g0_gpu_test
		FAST
		CUDA
		GTEST_MAIN
		INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR}
		LIBS ${DCA_LIBS} ${KERNELS_LIB}
		)

test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_complex_g0_gpu_test.cpp

+156 −18

Original line number	Diff line number	Diff line
		@@ -40,6 +40,21 @@ using McOptions = MockMcOptions<Scalar>;

		constexpr bool update_baseline = false;

		constexpr bool write_G4s = true;

		#ifdef DCA_HAVE_ADIOS2
		adios2::ADIOS* adios_ptr;
		#endif

		#ifdef DCA_HAVE_MPI
		#include "dca/parallel/mpi_concurrency/mpi_concurrency.hpp"
		dca::parallel::MPIConcurrency* concurrency_ptr;
		#else
		#include "dca/parallel/no_concurrency/no_concurrency.hpp"
		dca::parallel::NoConcurrency* concurrency_ptr;
		#endif


		#define INPUT_DIR \
		DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/"

		@@ -49,29 +64,55 @@ using ConfigGenerator = dca::testing::AccumulationTest<std::complex<double>>;
		using Configuration = ConfigGenerator::Configuration;
		using Sample = ConfigGenerator::Sample;

		using TpAccumulatorComplexG0GpuTest =
		dca::testing::G0Setup<Scalar, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_AUX, input_file>;
		template <typename SCALAR>
		struct TpAccumulatorComplexG0GpuTest : public ::testing::Test {
		using G0Setup = dca::testing::G0SetupBare<SCALAR, dca::testing::LatticeRashba,
		dca::ClusterSolverId::CT_AUX, input_file>;
		virtual void SetUp() {
		host_setup.SetUp();
		gpu_setup.SetUp();
		}

		virtual void TearDown() {}
		G0Setup host_setup;
		G0Setup gpu_setup;
		};


		uint loop_counter = 0;

		TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) {
		dca::linalg::util::initializeMagma();
		using TestTypes = ::testing::Types<std::complex<double>>;
		TYPED_TEST_CASE(TpAccumulatorComplexG0GpuTest, TestTypes);

		#define TYPING_PREFACE \
		using Scalar = TypeParam; \
		using ConfigGenerator = dca::testing::AccumulationTest<Scalar>; \
		using Configuration = typename ConfigGenerator::Configuration; \
		using Sample = typename ConfigGenerator::Sample;

		const std::array<int, 2> n{35, 0};
		TYPED_TEST(TpAccumulatorComplexG0GpuTest, Accumulate) {
		TYPING_PREFACE

		const std::array<int, 2> n{18, 22};
		Sample M;
		Configuration config;
		ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest::BDmn::dmn_size(),
		TpAccumulatorComplexG0GpuTest::RDmn::dmn_size(),
		parameters_.get_beta(), n);
		using FourPointType = dca::phys::FourPointType;

		ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::BDmn::dmn_size(),
		TpAccumulatorComplexG0GpuTest<Scalar>::G0Setup::RDmn::dmn_size(),
		this->host_setup.parameters_.get_beta(), n);
		std::vector<FourPointType> four_point_channels{
		FourPointType::PARTICLE_PARTICLE_UP_DOWN};

		using namespace dca::phys;
		parameters_.set_four_point_channels(std::vector<FourPointType>{FourPointType::PARTICLE_PARTICLE_UP_DOWN});
		this->host_setup.parameters_.set_four_point_channels(four_point_channels);
		this->gpu_setup.parameters_.set_four_point_channels(four_point_channels);

		dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::CPU> accumulatorHost(
		data_->G0_k_w_cluster_excluded, parameters_);
		dca::phys::solver::accumulator::TpAccumulator<Parameters, dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice(
		data_->G0_k_w_cluster_excluded, parameters_);
		const int8_t sign = 1;
		dca::phys::solver::accumulator::TpAccumulator<decltype(this->host_setup.parameters_), dca::DistType::NONE, dca::linalg::CPU> accumulatorHost(
		this->host_setup.data_->G0_k_w_cluster_excluded, this->host_setup.parameters_);
		dca::phys::solver::accumulator::TpAccumulator<decltype(this->gpu_setup.parameters_), dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice(
		this->gpu_setup.data_->G0_k_w_cluster_excluded, this->gpu_setup.parameters_);
		const std::complex<double> sign = {1.0, 0.0};

		accumulatorDevice.resetAccumulation(loop_counter);
		accumulatorDevice.accumulate(M, config, sign);
		@@ -83,10 +124,107 @@ TEST_F(TpAccumulatorComplexG0GpuTest, Accumulate) {

		++loop_counter;

		for (std::size_t channel = 0; channel < accumulatorHost.num_channels(); ++channel) {
		const auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel],
		#ifdef DCA_HAVE_ADIOS2
		if (write_G4s) {
		dca::io::Writer writer(adios_ptr, concurrency_ptr, "ADIOS2", true);
		dca::io::Writer writer_h5(adios_ptr, concurrency_ptr, "HDF5", true);

		writer.open_file("tp_gpu_test_complex_G0_G4.bp");
		writer_h5.open_file("tp_gpu_test_complex_G0_G4.hdf5");

		this->host_setup.parameters_.write(writer);
		this->host_setup.parameters_.write(writer_h5);
		this->host_setup.data_->write(writer);
		this->host_setup.data_->write(writer_h5);

		for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) {
		std::string channel_str =
		dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]);
		writer.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]);
		writer.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]);
		writer_h5.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]);
		writer_h5.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]);
		}
		writer_h5.execute("accumulatorDevice_G_0", accumulatorDevice.get_G_Debug()[0]);
		writer_h5.execute("accumulatorDevice_G_1", accumulatorDevice.get_G_Debug()[1]);
		writer_h5.execute("accumulatorHOST_G", accumulatorHost.get_G_Debug());

		#ifndef NDEBUG
		const auto& G_up = accumulatorDevice.get_G_Debug()[0];
		const auto& G_down = accumulatorDevice.get_G_Debug()[1];
		using Parameters = decltype(this->host_setup.parameters_);
		using TpComplex = typename decltype(accumulatorDevice)::TpComplex;
		using HostSpinSepG = dca::linalg::ReshapableMatrix<TpComplex, dca::linalg::CPU,
		dca::linalg::util::PinnedAllocator<TpComplex>>;
		std::array<HostSpinSepG, 2> G_spin_separated{G_up.size(), G_down.size()};

		using WTpExtDmn = dca::func::dmn_0<domains::vertex_frequency_domain<domains::EXTENDED>>;
		using KDmn = typename Parameters::KClusterDmn;
		using BDmn = dca::func::dmn_0<domains::electron_band_domain>;
		using SDmn = dca::func::dmn_0<domains::electron_spin_domain>;
		auto& g_all = accumulatorHost.get_G_Debug();

		for (int spin = 0; spin < SDmn::dmn_size(); ++spin) {
		auto& g_this_spin = G_spin_separated[spin];
		auto g_it = g_this_spin.begin();
		for (int w1 = 0; w1 < WTpExtDmn::dmn_size(); ++w1)
		for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1)
		for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1)
		for (int w2 = 0; w2 < WTpExtDmn::dmn_size(); ++w2)
		for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2)
		for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2, ++g_it)
		*g_it = g_all(b1, b2, spin, k2, k1, w2, w1);
		}

		writer_h5.execute("accumulatorHOST_G_0", G_spin_separated[0]);
		writer_h5.execute("accumulatorHOST_G_1", G_spin_separated[1]);

		for (int i = 0; i < G_up.size().first; ++i)
		for (int j = 0; j < G_up.size().second; ++j) {
		EXPECT_NEAR(G_up(i, j).real(), G_spin_separated[0](i, j).real(), 1E-12) << "( " << i << ", " << j << " )";
		EXPECT_NEAR(G_up(i, j).imag(), G_spin_separated[0](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )";
		EXPECT_NEAR(G_down(i, j).real(), G_spin_separated[1](i, j).real(), 1E-12) << "( " << i << ", " << j << " )";
		EXPECT_NEAR(G_down(i, j).imag(), G_spin_separated[1](i, j).imag(), 1E-12) << "( " << i << ", " << j << " )";
		}
		#endif

		writer.close_file();
		writer_h5.close_file();
		}
		#endif

		std::cout << "blocks: " << dca::util::ceilDiv(int(accumulatorHost.get_G4()[0].size()), 256) << '\n';

		for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) {
		auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel],
		accumulatorDevice.get_G4()[channel]);
		EXPECT_GT(5e-7, diff.l_inf);
		EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]);
		}
		}


		int main(int argc, char** argv) {
		#ifdef DCA_HAVE_MPI
		dca::parallel::MPIConcurrency concurrency(argc, argv);
		concurrency_ptr = &concurrency;
		#else
		dca::parallel::NoConcurrency concurrency(argc, argv);
		concurrency_ptr = &concurrency;
		#endif

		dca::linalg::util::initializeMagma();

		#ifdef DCA_HAVE_ADIOS2
		// ADIOS expects MPI_COMM pointer or nullptr
		adios2::ADIOS adios("", concurrency_ptr->get(), false);
		adios_ptr = &adios;
		#endif
		::testing::InitGoogleTest(&argc, argv);

		// ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners();
		// delete listeners.Release(listeners.default_result_printer());
		// listeners.Append(new dca::testing::MinimalistPrinter);

		int result = RUN_ALL_TESTS();
		return result;
		}

include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp

+2 −2

File changed.

Contains only whitespace changes.