Loading cmake/dca_config.cmake +10 −2 Original line number Diff line number Diff line Loading @@ -118,8 +118,8 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/lattice_model.hpp.in" ################################################################################ # Select the profiler type and enable auto-tuning. set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI.") set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI) set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI | Cuda.") set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI Cuda) if (DCA_PROFILER STREQUAL "Counting") set(DCA_PROFILING_EVENT_TYPE dca::profiling::time_event<std::size_t>) Loading @@ -133,6 +133,14 @@ elseif (DCA_PROFILER STREQUAL "PAPI") set(DCA_PROFILER_TYPE dca::profiling::CountingProfiler<Event>) set(DCA_PROFILER_INCLUDE "dca/profiling/counting_profiler.hpp") # Note: this profiler requires using the PTHREAD library and CUDA_TOOLS_EXT_LIBRARY elseif (DCA_PROFILER STREQUAL "Cuda") set(DCA_PROFILING_EVENT_INCLUDE "dca/profiling/events/time.hpp") set(DCA_PROFILING_EVENT_TYPE "void") set(DCA_PROFILER_TYPE dca::profiling::CudaProfiler) set(DCA_PROFILER_INCLUDE "dca/profiling/cuda_profiler.hpp") link_libraries(${CUDA_nvToolsExt_LIBRARY}) else() # DCA_PROFILER = None # The NullProfiler doesn't have an event type. set(DCA_PROFILING_EVENT_TYPE void) Loading include/dca/profiling/cuda_profiler.hpp 0 → 100644 +73 −0 Original line number Diff line number Diff line // Copyright (C) 2018 ETH Zurich // Copyright (C) 2018 UT-Battelle, LLC // All rights reserved. // // See LICENSE for terms of usage. // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // // Annotate CPU functions in NVPROF. Depends on PTHREAD. 
#ifndef DCA_PROFILING_CUDA_PROFILER_HPP #define DCA_PROFILING_CUDA_PROFILER_HPP #include <string> #include <pthread.h> #include <nvToolsExtCuda.h> namespace dca { namespace profiling { // dca::profiling:: class CudaProfiler { public: inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line); inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line, int thread_id); inline ~CudaProfiler(); static void start() { active_ = true; } static void stop() { active_ = false; } static void start_threading(int /*id*/){} static void stop_threading(int /*id*/){} template<class Concurrency> static void stop(Concurrency& /*conc*/, const std::string& /*name*/){} private: inline static bool active_ = false; }; CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name, int /*line*/) { if (active_) { nvtxNameOsThread(pthread_self(), "Master"); nvtxRangePush((function_name + " - " + category_name).c_str()); } } CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name, int /*line*/, int id) { if (active_) { nvtxNameOsThread(pthread_self(), ("Thread" + std::to_string(id)).c_str()); nvtxRangePush((function_name + " - " + category_name).c_str()); } } CudaProfiler::~CudaProfiler() { if (active_) nvtxRangePop(); } } // namespace profiling } // namespace dca #endif // DCA_PROFILING_CUDA_PROFILER_HPP test/performance/phys/ctaux/CMakeLists.txt +2 −1 Original line number Diff line number Diff line Loading @@ -8,5 +8,6 @@ target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_SOURCE_DIR= if (DCA_HAVE_CUDA) target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_HAVE_CUDA) target_link_libraries(ctaux_walker_performance_test PRIVATE ${DCA_LIBS}) target_link_directories(ctaux_walker_performance_test PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) target_link_libraries(ctaux_walker_performance_test PRIVATE nvToolsExt ${DCA_LIBS}) endif () 
test/performance/phys/ctaux/ctaux_walker_performance_test.cpp +61 −72 Original line number Diff line number Diff line Loading @@ -18,9 +18,8 @@ #include <cuda_profiler_api.h> #endif #include "dca/io/hdf5/hdf5_reader.hpp" #include "dca/io/json/json_reader.hpp" #include "dca/config/mc_options.hpp" #include "dca/io/json/json_reader.hpp" #include "dca/math/random/std_random_wrapper.hpp" #include "dca/phys/dca_data/dca_data.hpp" #include "dca/phys/domains/cluster/symmetries/point_groups/2d/2d_square.hpp" Loading @@ -30,9 +29,7 @@ #include "dca/parallel/no_threading/no_threading.hpp" #include "dca/phys/parameters/parameters.hpp" #include "dca/profiling/events/time.hpp" #include "dca/profiling/counting_profiler.hpp" #include "dca/profiling/events/time_event.hpp" #include "dca/util/ignore.hpp" #include "dca/profiling/cuda_profiler.hpp" const std::string input_dir = DCA_SOURCE_DIR "/test/performance/phys/ctaux/"; Loading @@ -41,42 +38,44 @@ using Lattice = dca::phys::models::bilayer_lattice<dca::phys::domains::D4>; using Model = dca::phys::models::TightBindingModel<Lattice>; using Threading = dca::parallel::NoThreading; using Concurrency = dca::parallel::NoConcurrency; using Profiler = dca::profiling::CountingProfiler<dca::profiling::time_event<std::size_t>>; using Profiler = dca::profiling::CudaProfiler; using Parameters = dca::phys::params::Parameters<Concurrency, Threading, Profiler, Model, RngType, dca::phys::solver::CT_AUX>; using Data = dca::phys::DcaData<Parameters>; using Real = dca::config::McOptions::MCScalar; template <dca::linalg::DeviceType device_t> using Walker = dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, dca::config::McOptions::MCScalar>; using Walker = dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, Real>; int main(int argc, char** argv) { bool test_cpu(true), test_gpu(true); #ifdef DCA_HAVE_CUDA int submatrix_size = -1; int n_warmup = 30; int n_sweeps = 5; dca::util::ignoreUnused(test_gpu); int n_walkers = -1; for 
(int i = 0; i < argc; ++i) { const std::string arg(argv[i]); if (arg == "--skip_cpu") test_cpu = false; else if (arg == "--skip_gpu") test_gpu = false; else if (arg == "--submatrix_size") if (arg == "--submatrix_size") submatrix_size = std::atoi(argv[i + 1]); else if (arg == "--n_sweeps") n_sweeps = std::atoi(argv[i + 1]); else if (arg == "--n_walkers") n_walkers = std::atoi(argv[i + 1]); } Concurrency concurrency(argc, argv); Parameters parameters("", concurrency); parameters.read_input_and_broadcast<dca::io::JSONReader>(input_dir + "bilayer_lattice_input.json"); if (submatrix_size != -1) parameters.set_max_submatrix_size(submatrix_size); parameters.update_model(); parameters.update_domains(); if (submatrix_size != -1) parameters.set_max_submatrix_size(submatrix_size); if (n_walkers == -1) n_walkers = parameters.get_walkers(); // Initialize data with G0 computation. Data data(parameters); data.initialize(); Loading @@ -86,72 +85,62 @@ int main(int argc, char** argv) { std::cout << str << ": time taken: " << time.sec + 1e-6 * time.usec << std::endl; }; auto do_sweeps = [¶meters](auto& walker, int n) { auto do_sweeps = [¶meters](auto& walker, int n, bool verbose) { for (int i = 0; i < n; ++i) { walker.doSweep(); if (verbose) walker.updateShell(i, n); } }; std::cout << "\n\n *********** GPU integration ***************\n"; std::cout << "Nr walkers: " << n_walkers << "\n\n"; std::cout << "Integrating with max-submatrix-size: " << parameters.get_max_submatrix_size() << std::endl; if (test_cpu) { std::cout << "\n\n *********** CPU integration ***************\n" << std::endl; dca::linalg::util::initializeMagma(); dca::linalg::util::resizeHandleContainer(n_walkers); // TODO: always start if the profiler supports the writing of multiple files. 
if (!test_gpu) Profiler::start(); RngType::resetCounter(); std::vector<RngType> rngs; std::vector<Walker<dca::linalg::GPU>> walkers; rngs.reserve(n_walkers); walkers.reserve(n_walkers); for (int i = 0; i < n_walkers; ++i) { rngs.emplace_back(0, 1, 0); walkers.emplace_back(parameters, data, rngs.back(), i); } // Do one integration step. RngType rng(0, 1, 0); Walker<dca::linalg::CPU> walker(parameters, data, rng, 0); walker.initialize(); std::vector<std::future<void>> fs; dca::parallel::ThreadPool pool(n_walkers); for (int i = 0; i < n_walkers; ++i) { fs.push_back(pool.enqueue([&do_sweeps, &walkers, i, n_warmup]() { walkers[i].initialize(); do_sweeps(walkers[i], n_warmup, i == 0); walkers[i].is_thermalized() = true; })); } do_sweeps(walker, n_warmup); for (auto& f : fs) f.get(); fs.clear(); std::cout << "\n Warmed up.\n" << std::endl; // Timed section. cudaProfilerStart(); Profiler::start(); dca::profiling::WallTime start_t; do_sweeps(walker, n_sweeps); dca::profiling::WallTime integration_t; walker.printSummary(); std::cout << std::endl; printTime("Integration CPU", start_t, integration_t); if (!test_gpu) Profiler::stop(concurrency, "profile_cpu.txt"); for (int i = 0; i < n_walkers; ++i) { fs.push_back(pool.enqueue( [&do_sweeps, &walkers, i, n_sweeps]() { do_sweeps(walkers[i], n_sweeps, i == 0); })); } for (auto& f : fs) f.get(); #ifdef DCA_HAVE_CUDA if (test_gpu) { std::cout << "\n\n *********** GPU integration ***************\n\n"; std::cout.flush(); dca::linalg::util::initializeMagma(); Profiler::start(); RngType::resetCounter(); RngType rng(0, 1, 0); Walker<dca::linalg::GPU> walker_gpu(parameters, data, rng, 0); walker_gpu.initialize(); do_sweeps(walker_gpu, n_warmup); std::cout << "\n Warmed up.\n" << std::endl; // Timed section. 
cudaProfilerStart(); dca::profiling::WallTime start_t; do_sweeps(walker_gpu, n_sweeps); dca::profiling::WallTime integration_t; Profiler::stop(); cudaProfilerStop(); walker_gpu.printSummary(); Profiler::stop(concurrency, "profile_gpu.txt"); std::cout << std::endl; printTime("Integration GPU", start_t, integration_t); } #endif // DCA_HAVE_CUDA } Loading
cmake/dca_config.cmake +10 −2 Original line number Diff line number Diff line Loading @@ -118,8 +118,8 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/lattice_model.hpp.in" ################################################################################ # Select the profiler type and enable auto-tuning. set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI.") set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI) set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI | Cuda.") set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI Cuda) if (DCA_PROFILER STREQUAL "Counting") set(DCA_PROFILING_EVENT_TYPE dca::profiling::time_event<std::size_t>) Loading @@ -133,6 +133,14 @@ elseif (DCA_PROFILER STREQUAL "PAPI") set(DCA_PROFILER_TYPE dca::profiling::CountingProfiler<Event>) set(DCA_PROFILER_INCLUDE "dca/profiling/counting_profiler.hpp") # Note: this profiler requires using the PTHREAD library and CUDA_TOOLS_EXT_LIBRARY elseif (DCA_PROFILER STREQUAL "Cuda") set(DCA_PROFILING_EVENT_INCLUDE "dca/profiling/events/time.hpp") set(DCA_PROFILING_EVENT_TYPE "void") set(DCA_PROFILER_TYPE dca::profiling::CudaProfiler) set(DCA_PROFILER_INCLUDE "dca/profiling/cuda_profiler.hpp") link_libraries(${CUDA_nvToolsExt_LIBRARY}) else() # DCA_PROFILER = None # The NullProfiler doesn't have an event type. set(DCA_PROFILING_EVENT_TYPE void) Loading
include/dca/profiling/cuda_profiler.hpp 0 → 100644 +73 −0 Original line number Diff line number Diff line // Copyright (C) 2018 ETH Zurich // Copyright (C) 2018 UT-Battelle, LLC // All rights reserved. // // See LICENSE for terms of usage. // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // // Annotate CPU functions in NVPROF. Depends on PTHREAD. #ifndef DCA_PROFILING_CUDA_PROFILER_HPP #define DCA_PROFILING_CUDA_PROFILER_HPP #include <string> #include <pthread.h> #include <nvToolsExtCuda.h> namespace dca { namespace profiling { // dca::profiling:: class CudaProfiler { public: inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line); inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line, int thread_id); inline ~CudaProfiler(); static void start() { active_ = true; } static void stop() { active_ = false; } static void start_threading(int /*id*/){} static void stop_threading(int /*id*/){} template<class Concurrency> static void stop(Concurrency& /*conc*/, const std::string& /*name*/){} private: inline static bool active_ = false; }; CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name, int /*line*/) { if (active_) { nvtxNameOsThread(pthread_self(), "Master"); nvtxRangePush((function_name + " - " + category_name).c_str()); } } CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name, int /*line*/, int id) { if (active_) { nvtxNameOsThread(pthread_self(), ("Thread" + std::to_string(id)).c_str()); nvtxRangePush((function_name + " - " + category_name).c_str()); } } CudaProfiler::~CudaProfiler() { if (active_) nvtxRangePop(); } } // namespace profiling } // namespace dca #endif // DCA_PROFILING_CUDA_PROFILER_HPP
# test/performance/phys/ctaux/CMakeLists.txt (+2 −1)
# Hunk @@ -8,5 +8,6 @@ — follows:
#   target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_SOURCE_DIR=...)
if (DCA_HAVE_CUDA)
  target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_HAVE_CUDA)
  # Link nvToolsExt through the FindCUDA-provided absolute path, consistent with
  # the Cuda-profiler branch in cmake/dca_config.cmake, instead of hard-coding
  # the bare library name plus a manual ${CUDA_TOOLKIT_ROOT_DIR}/lib64/ link
  # directory (which breaks on toolkits installed under a different layout).
  target_link_libraries(ctaux_walker_performance_test PRIVATE ${CUDA_nvToolsExt_LIBRARY} ${DCA_LIBS})
endif ()
test/performance/phys/ctaux/ctaux_walker_performance_test.cpp +61 −72 Original line number Diff line number Diff line Loading @@ -18,9 +18,8 @@ #include <cuda_profiler_api.h> #endif #include "dca/io/hdf5/hdf5_reader.hpp" #include "dca/io/json/json_reader.hpp" #include "dca/config/mc_options.hpp" #include "dca/io/json/json_reader.hpp" #include "dca/math/random/std_random_wrapper.hpp" #include "dca/phys/dca_data/dca_data.hpp" #include "dca/phys/domains/cluster/symmetries/point_groups/2d/2d_square.hpp" Loading @@ -30,9 +29,7 @@ #include "dca/parallel/no_threading/no_threading.hpp" #include "dca/phys/parameters/parameters.hpp" #include "dca/profiling/events/time.hpp" #include "dca/profiling/counting_profiler.hpp" #include "dca/profiling/events/time_event.hpp" #include "dca/util/ignore.hpp" #include "dca/profiling/cuda_profiler.hpp" const std::string input_dir = DCA_SOURCE_DIR "/test/performance/phys/ctaux/"; Loading @@ -41,42 +38,44 @@ using Lattice = dca::phys::models::bilayer_lattice<dca::phys::domains::D4>; using Model = dca::phys::models::TightBindingModel<Lattice>; using Threading = dca::parallel::NoThreading; using Concurrency = dca::parallel::NoConcurrency; using Profiler = dca::profiling::CountingProfiler<dca::profiling::time_event<std::size_t>>; using Profiler = dca::profiling::CudaProfiler; using Parameters = dca::phys::params::Parameters<Concurrency, Threading, Profiler, Model, RngType, dca::phys::solver::CT_AUX>; using Data = dca::phys::DcaData<Parameters>; using Real = dca::config::McOptions::MCScalar; template <dca::linalg::DeviceType device_t> using Walker = dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, dca::config::McOptions::MCScalar>; using Walker = dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, Real>; int main(int argc, char** argv) { bool test_cpu(true), test_gpu(true); #ifdef DCA_HAVE_CUDA int submatrix_size = -1; int n_warmup = 30; int n_sweeps = 5; dca::util::ignoreUnused(test_gpu); int n_walkers = -1; for 
(int i = 0; i < argc; ++i) { const std::string arg(argv[i]); if (arg == "--skip_cpu") test_cpu = false; else if (arg == "--skip_gpu") test_gpu = false; else if (arg == "--submatrix_size") if (arg == "--submatrix_size") submatrix_size = std::atoi(argv[i + 1]); else if (arg == "--n_sweeps") n_sweeps = std::atoi(argv[i + 1]); else if (arg == "--n_walkers") n_walkers = std::atoi(argv[i + 1]); } Concurrency concurrency(argc, argv); Parameters parameters("", concurrency); parameters.read_input_and_broadcast<dca::io::JSONReader>(input_dir + "bilayer_lattice_input.json"); if (submatrix_size != -1) parameters.set_max_submatrix_size(submatrix_size); parameters.update_model(); parameters.update_domains(); if (submatrix_size != -1) parameters.set_max_submatrix_size(submatrix_size); if (n_walkers == -1) n_walkers = parameters.get_walkers(); // Initialize data with G0 computation. Data data(parameters); data.initialize(); Loading @@ -86,72 +85,62 @@ int main(int argc, char** argv) { std::cout << str << ": time taken: " << time.sec + 1e-6 * time.usec << std::endl; }; auto do_sweeps = [¶meters](auto& walker, int n) { auto do_sweeps = [¶meters](auto& walker, int n, bool verbose) { for (int i = 0; i < n; ++i) { walker.doSweep(); if (verbose) walker.updateShell(i, n); } }; std::cout << "\n\n *********** GPU integration ***************\n"; std::cout << "Nr walkers: " << n_walkers << "\n\n"; std::cout << "Integrating with max-submatrix-size: " << parameters.get_max_submatrix_size() << std::endl; if (test_cpu) { std::cout << "\n\n *********** CPU integration ***************\n" << std::endl; dca::linalg::util::initializeMagma(); dca::linalg::util::resizeHandleContainer(n_walkers); // TODO: always start if the profiler supports the writing of multiple files. 
if (!test_gpu) Profiler::start(); RngType::resetCounter(); std::vector<RngType> rngs; std::vector<Walker<dca::linalg::GPU>> walkers; rngs.reserve(n_walkers); walkers.reserve(n_walkers); for (int i = 0; i < n_walkers; ++i) { rngs.emplace_back(0, 1, 0); walkers.emplace_back(parameters, data, rngs.back(), i); } // Do one integration step. RngType rng(0, 1, 0); Walker<dca::linalg::CPU> walker(parameters, data, rng, 0); walker.initialize(); std::vector<std::future<void>> fs; dca::parallel::ThreadPool pool(n_walkers); for (int i = 0; i < n_walkers; ++i) { fs.push_back(pool.enqueue([&do_sweeps, &walkers, i, n_warmup]() { walkers[i].initialize(); do_sweeps(walkers[i], n_warmup, i == 0); walkers[i].is_thermalized() = true; })); } do_sweeps(walker, n_warmup); for (auto& f : fs) f.get(); fs.clear(); std::cout << "\n Warmed up.\n" << std::endl; // Timed section. cudaProfilerStart(); Profiler::start(); dca::profiling::WallTime start_t; do_sweeps(walker, n_sweeps); dca::profiling::WallTime integration_t; walker.printSummary(); std::cout << std::endl; printTime("Integration CPU", start_t, integration_t); if (!test_gpu) Profiler::stop(concurrency, "profile_cpu.txt"); for (int i = 0; i < n_walkers; ++i) { fs.push_back(pool.enqueue( [&do_sweeps, &walkers, i, n_sweeps]() { do_sweeps(walkers[i], n_sweeps, i == 0); })); } for (auto& f : fs) f.get(); #ifdef DCA_HAVE_CUDA if (test_gpu) { std::cout << "\n\n *********** GPU integration ***************\n\n"; std::cout.flush(); dca::linalg::util::initializeMagma(); Profiler::start(); RngType::resetCounter(); RngType rng(0, 1, 0); Walker<dca::linalg::GPU> walker_gpu(parameters, data, rng, 0); walker_gpu.initialize(); do_sweeps(walker_gpu, n_warmup); std::cout << "\n Warmed up.\n" << std::endl; // Timed section. 
cudaProfilerStart(); dca::profiling::WallTime start_t; do_sweeps(walker_gpu, n_sweeps); dca::profiling::WallTime integration_t; Profiler::stop(); cudaProfilerStop(); walker_gpu.printSummary(); Profiler::stop(concurrency, "profile_gpu.txt"); std::cout << std::endl; printTime("Integration GPU", start_t, integration_t); } #endif // DCA_HAVE_CUDA }