Commit 2b5cdd79 authored by gbalduzz's avatar gbalduzz
Browse files

Annotate CPU calls with NVTX ranges so they appear in nvprof

parent 1c675be4
Loading
Loading
Loading
Loading
+10 −2
Original line number Diff line number Diff line
@@ -118,8 +118,8 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/lattice_model.hpp.in"

################################################################################
# Select the profiler type and enable auto-tuning.
set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI.")
set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI)
set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI | Cuda.")
set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI Cuda)

if (DCA_PROFILER STREQUAL "Counting")
  set(DCA_PROFILING_EVENT_TYPE dca::profiling::time_event<std::size_t>)
@@ -133,6 +133,14 @@ elseif (DCA_PROFILER STREQUAL "PAPI")
  set(DCA_PROFILER_TYPE dca::profiling::CountingProfiler<Event>)
  set(DCA_PROFILER_INCLUDE "dca/profiling/counting_profiler.hpp")

# Note: this profiler requires linking against the pthread library and the CUDA
# NVTX library (CUDA_nvToolsExt_LIBRARY).
elseif (DCA_PROFILER STREQUAL "Cuda")
  set(DCA_PROFILING_EVENT_INCLUDE "dca/profiling/events/time.hpp")
  set(DCA_PROFILING_EVENT_TYPE "void")
  set(DCA_PROFILER_TYPE dca::profiling::CudaProfiler)
  set(DCA_PROFILER_INCLUDE "dca/profiling/cuda_profiler.hpp")
  link_libraries(${CUDA_nvToolsExt_LIBRARY})

else()  # DCA_PROFILER = None
  # The NullProfiler doesn't have an event type.
  set(DCA_PROFILING_EVENT_TYPE void)
+73 −0
Original line number Diff line number Diff line
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// Annotates CPU functions with NVTX ranges so they show up in the nvprof/Nsight
// timeline. Depends on pthread for naming the calling OS thread.

#ifndef DCA_PROFILING_CUDA_PROFILER_HPP
#define DCA_PROFILING_CUDA_PROFILER_HPP

#include <atomic>
#include <string>

#include <pthread.h>
#include <nvToolsExtCuda.h>

namespace dca {
namespace profiling {
// dca::profiling::

// RAII annotator: each instance pushes an NVTX range on construction and pops
// it on destruction, so profiled CPU scopes appear as nested regions in the
// nvprof/Nsight timeline. Ranges are emitted only between start() and stop().
class CudaProfiler {
public:
  // Opens a range labeled "<function> - <file>" on the master thread.
  // `line` is accepted for interface compatibility with the other profiler
  // types but is unused.
  inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line);

  // Same as above, but names the calling OS thread after `thread_id`.
  inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line,
                      int thread_id);

  // Pops the range opened by the constructor (if profiling was active).
  inline ~CudaProfiler();

  // Enable range emission (typically called right before the timed section).
  static void start() {
    active_ = true;
  }

  // Disable range emission.
  static void stop() {
    active_ = false;
  }

  // No-op hooks kept for interface compatibility with the other profilers.
  static void start_threading(int /*id*/) {}
  static void stop_threading(int /*id*/) {}
  template <class Concurrency>
  static void stop(Concurrency& /*conc*/, const std::string& /*name*/) {}

private:
  // Atomic: start()/stop() are called from the main thread while worker
  // threads read the flag concurrently in the ctors/dtor; a plain bool here
  // would be a data race.
  inline static std::atomic<bool> active_{false};
};

// Opens an NVTX range "<function> - <category>" for the master thread. The
// matching nvtxRangePop() happens in the destructor (RAII).
CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name,
                           int /*line*/) {
  if (active_) {
    // Name the OS thread only once per thread instead of on every profiled
    // scope entry: the label is the constant "Master", so repeating the call
    // is pure overhead inside a profiling hot path.
    // NOTE(review): nvtxNameOsThread expects the OS thread id; pthread_self()
    // returns an opaque pthread handle that may differ from it on some
    // platforms -- confirm against the NVTX documentation.
    static thread_local bool thread_named = false;
    if (!thread_named) {
      nvtxNameOsThread(pthread_self(), "Master");
      thread_named = true;
    }
    nvtxRangePush((function_name + " - " + category_name).c_str());
  }
}

// Opens an NVTX range "<function> - <category>" and labels the calling OS
// thread "Thread<id>" so worker threads are distinguishable in the timeline.
// The matching nvtxRangePop() happens in the destructor (RAII).
CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name,
                           int /*line*/, int id) {
  if (!active_)
    return;
  const std::string thread_label = "Thread" + std::to_string(id);
  nvtxNameOsThread(pthread_self(), thread_label.c_str());
  nvtxRangePush((function_name + " - " + category_name).c_str());
}

// Closes the range opened by the constructor. A no-op when profiling is
// inactive, mirroring the guard in the constructors.
CudaProfiler::~CudaProfiler() {
  if (!active_)
    return;
  nvtxRangePop();
}

}  // namespace profiling
}  // namespace dca

#endif  // DCA_PROFILING_CUDA_PROFILER_HPP
+2 −1
Original line number Diff line number Diff line
@@ -8,5 +8,6 @@ target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_SOURCE_DIR=

if (DCA_HAVE_CUDA)
  target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_HAVE_CUDA)
  target_link_libraries(ctaux_walker_performance_test PRIVATE ${DCA_LIBS})
  target_link_directories(ctaux_walker_performance_test PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/lib64/)
  target_link_libraries(ctaux_walker_performance_test PRIVATE nvToolsExt ${DCA_LIBS})
endif ()
+61 −72
Original line number Diff line number Diff line
@@ -18,9 +18,8 @@
#include <cuda_profiler_api.h>
#endif

#include "dca/io/hdf5/hdf5_reader.hpp"
#include "dca/io/json/json_reader.hpp"
#include "dca/config/mc_options.hpp"
#include "dca/io/json/json_reader.hpp"
#include "dca/math/random/std_random_wrapper.hpp"
#include "dca/phys/dca_data/dca_data.hpp"
#include "dca/phys/domains/cluster/symmetries/point_groups/2d/2d_square.hpp"
@@ -30,9 +29,7 @@
#include "dca/parallel/no_threading/no_threading.hpp"
#include "dca/phys/parameters/parameters.hpp"
#include "dca/profiling/events/time.hpp"
#include "dca/profiling/counting_profiler.hpp"
#include "dca/profiling/events/time_event.hpp"
#include "dca/util/ignore.hpp"
#include "dca/profiling/cuda_profiler.hpp"

const std::string input_dir = DCA_SOURCE_DIR "/test/performance/phys/ctaux/";

@@ -41,42 +38,44 @@ using Lattice = dca::phys::models::bilayer_lattice<dca::phys::domains::D4>;
using Model = dca::phys::models::TightBindingModel<Lattice>;
using Threading = dca::parallel::NoThreading;
using Concurrency = dca::parallel::NoConcurrency;
using Profiler = dca::profiling::CountingProfiler<dca::profiling::time_event<std::size_t>>;
using Profiler = dca::profiling::CudaProfiler;
using Parameters = dca::phys::params::Parameters<Concurrency, Threading, Profiler, Model, RngType,
                                                 dca::phys::solver::CT_AUX>;
using Data = dca::phys::DcaData<Parameters>;
using Real = dca::config::McOptions::MCScalar;
template <dca::linalg::DeviceType device_t>
using Walker =
    dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, dca::config::McOptions::MCScalar>;
using Walker = dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, Real>;

int main(int argc, char** argv) {
  bool test_cpu(true), test_gpu(true);
#ifdef DCA_HAVE_CUDA
  int submatrix_size = -1;
  int n_warmup = 30;
  int n_sweeps = 5;
  dca::util::ignoreUnused(test_gpu);
  int n_walkers = -1;

  for (int i = 0; i < argc; ++i) {
    const std::string arg(argv[i]);
    if (arg == "--skip_cpu")
      test_cpu = false;
    else if (arg == "--skip_gpu")
      test_gpu = false;
    else if (arg == "--submatrix_size")
    if (arg == "--submatrix_size")
      submatrix_size = std::atoi(argv[i + 1]);
    else if (arg == "--n_sweeps")
      n_sweeps = std::atoi(argv[i + 1]);
    else if (arg == "--n_walkers")
      n_walkers = std::atoi(argv[i + 1]);
  }

  Concurrency concurrency(argc, argv);
  Parameters parameters("", concurrency);
  parameters.read_input_and_broadcast<dca::io::JSONReader>(input_dir +
                                                           "bilayer_lattice_input.json");
  if (submatrix_size != -1)
    parameters.set_max_submatrix_size(submatrix_size);

  parameters.update_model();
  parameters.update_domains();

  if (submatrix_size != -1)
    parameters.set_max_submatrix_size(submatrix_size);
  if (n_walkers == -1)
    n_walkers = parameters.get_walkers();

  // Initialize data with G0 computation.
  Data data(parameters);
  data.initialize();
@@ -86,72 +85,62 @@ int main(int argc, char** argv) {
    std::cout << str << ": time taken: " << time.sec + 1e-6 * time.usec << std::endl;
  };

  auto do_sweeps = [&parameters](auto& walker, int n) {
  auto do_sweeps = [&parameters](auto& walker, int n, bool verbose) {
    for (int i = 0; i < n; ++i) {
      walker.doSweep();
      if (verbose)
        walker.updateShell(i, n);
    }
  };
  std::cout << "\n\n  *********** GPU integration  ***************\n";
  std::cout << "Nr walkers: " << n_walkers << "\n\n";

  std::cout << "Integrating with max-submatrix-size: " << parameters.get_max_submatrix_size()
            << std::endl;

  if (test_cpu) {
    std::cout << "\n\n  *********** CPU integration  ***************\n" << std::endl;
  dca::linalg::util::initializeMagma();
  dca::linalg::util::resizeHandleContainer(n_walkers);

    // TODO: always start if the profiler supports the writing of multiple files.
    if (!test_gpu)
      Profiler::start();
  RngType::resetCounter();
  std::vector<RngType> rngs;
  std::vector<Walker<dca::linalg::GPU>> walkers;
  rngs.reserve(n_walkers);
  walkers.reserve(n_walkers);
  for (int i = 0; i < n_walkers; ++i) {
    rngs.emplace_back(0, 1, 0);
    walkers.emplace_back(parameters, data, rngs.back(), i);
  }

    // Do one integration step.
    RngType rng(0, 1, 0);
    Walker<dca::linalg::CPU> walker(parameters, data, rng, 0);
    walker.initialize();
  std::vector<std::future<void>> fs;
  dca::parallel::ThreadPool pool(n_walkers);
  for (int i = 0; i < n_walkers; ++i) {
    fs.push_back(pool.enqueue([&do_sweeps, &walkers, i, n_warmup]() {
      walkers[i].initialize();
      do_sweeps(walkers[i], n_warmup, i == 0);
      walkers[i].is_thermalized() = true;
    }));
  }

    do_sweeps(walker, n_warmup);
  for (auto& f : fs)
    f.get();
  fs.clear();
  std::cout << "\n Warmed up.\n" << std::endl;

  // Timed section.
  cudaProfilerStart();
  Profiler::start();
  dca::profiling::WallTime start_t;
    do_sweeps(walker, n_sweeps);
    dca::profiling::WallTime integration_t;
    walker.printSummary();

    std::cout << std::endl;
    printTime("Integration CPU", start_t, integration_t);

    if (!test_gpu)
      Profiler::stop(concurrency, "profile_cpu.txt");
  for (int i = 0; i < n_walkers; ++i) {
    fs.push_back(pool.enqueue(
        [&do_sweeps, &walkers, i, n_sweeps]() { do_sweeps(walkers[i], n_sweeps, i == 0); }));
  }
  for (auto& f : fs)
    f.get();

#ifdef DCA_HAVE_CUDA
  if (test_gpu) {
    std::cout << "\n\n  *********** GPU integration  ***************\n\n";
    std::cout.flush();
    dca::linalg::util::initializeMagma();

    Profiler::start();

    RngType::resetCounter();
    RngType rng(0, 1, 0);
    Walker<dca::linalg::GPU> walker_gpu(parameters, data, rng, 0);
    walker_gpu.initialize();

    do_sweeps(walker_gpu, n_warmup);
    std::cout << "\n Warmed up.\n" << std::endl;

    // Timed section.
    cudaProfilerStart();
    dca::profiling::WallTime start_t;
    do_sweeps(walker_gpu, n_sweeps);
  dca::profiling::WallTime integration_t;
  Profiler::stop();
  cudaProfilerStop();
    walker_gpu.printSummary();

    Profiler::stop(concurrency, "profile_gpu.txt");

  std::cout << std::endl;
  printTime("Integration GPU", start_t, integration_t);
  }

#endif  // DCA_HAVE_CUDA
}