Commit 2b5cdd79 authored by gbalduzz's avatar gbalduzz
Browse files

Annotate CPU calls with NVTX ranges so they appear in nvprof

parent 1c675be4
Loading
Loading
Loading
Loading
+10 −2
Original line number Diff line number Diff line
@@ -118,8 +118,8 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/lattice_model.hpp.in"

################################################################################
# Select the profiler type and enable auto-tuning.
set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI.")
set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI)
set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None | Counting | PAPI | Cuda.")
set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI Cuda)

if (DCA_PROFILER STREQUAL "Counting")
  set(DCA_PROFILING_EVENT_TYPE dca::profiling::time_event<std::size_t>)
@@ -133,6 +133,14 @@ elseif (DCA_PROFILER STREQUAL "PAPI")
  set(DCA_PROFILER_TYPE dca::profiling::CountingProfiler<Event>)
  set(DCA_PROFILER_INCLUDE "dca/profiling/counting_profiler.hpp")

# Note: this profiler requires linking against the pthread library and the CUDA
# NVTX library (CUDA_nvToolsExt_LIBRARY).
elseif (DCA_PROFILER STREQUAL "Cuda")
  set(DCA_PROFILING_EVENT_INCLUDE "dca/profiling/events/time.hpp")
  set(DCA_PROFILING_EVENT_TYPE "void")
  set(DCA_PROFILER_TYPE dca::profiling::CudaProfiler)
  set(DCA_PROFILER_INCLUDE "dca/profiling/cuda_profiler.hpp")
  link_libraries(${CUDA_nvToolsExt_LIBRARY})

else()  # DCA_PROFILER = None
  # The NullProfiler doesn't have an event type.
  set(DCA_PROFILING_EVENT_TYPE void)
+73 −0
Original line number Diff line number Diff line
// Copyright (C) 2018 ETH Zurich
// Copyright (C) 2018 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch)
//
// Annotates CPU functions with NVTX ranges so they show up in the nvprof/Nsight
// timeline. Depends on pthread for naming the calling OS thread.

#ifndef DCA_PROFILING_CUDA_PROFILER_HPP
#define DCA_PROFILING_CUDA_PROFILER_HPP

#include <atomic>
#include <string>

#include <pthread.h>
#include <nvToolsExtCuda.h>

namespace dca {
namespace profiling {
// dca::profiling::

// RAII annotator: each instance pushes an NVTX range on construction and pops
// it on destruction, so profiled CPU scopes appear as nested regions in the
// nvprof/Nsight timeline. Ranges are emitted only between start() and stop().
class CudaProfiler {
public:
  // Opens a range labeled "<function> - <file>" on the master thread.
  // `line` is accepted for interface compatibility with the other profiler
  // types but is unused.
  inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line);

  // Same as above, but names the calling OS thread after `thread_id`.
  inline CudaProfiler(const std::string& functionName_, const std::string& fileName_, int line,
                      int thread_id);

  // Pops the range opened by the constructor (if profiling was active).
  inline ~CudaProfiler();

  // Enable range emission (typically called right before the timed section).
  static void start() {
    active_ = true;
  }

  // Disable range emission.
  static void stop() {
    active_ = false;
  }

  // No-op hooks kept for interface compatibility with the other profilers.
  static void start_threading(int /*id*/) {}
  static void stop_threading(int /*id*/) {}
  template <class Concurrency>
  static void stop(Concurrency& /*conc*/, const std::string& /*name*/) {}

private:
  // Atomic: start()/stop() are called from the main thread while worker
  // threads read the flag concurrently in the ctors/dtor; a plain bool here
  // would be a data race.
  inline static std::atomic<bool> active_{false};
};

// Opens an NVTX range "<function> - <category>" for the master thread. The
// matching nvtxRangePop() happens in the destructor (RAII).
CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name,
                           int /*line*/) {
  if (active_) {
    // Name the OS thread only once per thread instead of on every profiled
    // scope entry: the label is the constant "Master", so repeating the call
    // is pure overhead inside a profiling hot path.
    // NOTE(review): nvtxNameOsThread expects the OS thread id; pthread_self()
    // returns an opaque pthread handle that may differ from it on some
    // platforms -- confirm against the NVTX documentation.
    static thread_local bool thread_named = false;
    if (!thread_named) {
      nvtxNameOsThread(pthread_self(), "Master");
      thread_named = true;
    }
    nvtxRangePush((function_name + " - " + category_name).c_str());
  }
}

// Opens an NVTX range "<function> - <category>" and labels the calling OS
// thread "Thread<id>" so worker threads are distinguishable in the timeline.
// The matching nvtxRangePop() happens in the destructor (RAII).
CudaProfiler::CudaProfiler(const std::string& function_name, const std::string& category_name,
                           int /*line*/, int id) {
  if (!active_)
    return;
  const std::string thread_label = "Thread" + std::to_string(id);
  nvtxNameOsThread(pthread_self(), thread_label.c_str());
  nvtxRangePush((function_name + " - " + category_name).c_str());
}

// Closes the range opened by the constructor. A no-op when profiling is
// inactive, mirroring the guard in the constructors.
CudaProfiler::~CudaProfiler() {
  if (!active_)
    return;
  nvtxRangePop();
}

}  // namespace profiling
}  // namespace dca

#endif  // DCA_PROFILING_CUDA_PROFILER_HPP
+2 −1
Original line number Diff line number Diff line
@@ -8,5 +8,6 @@ target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_SOURCE_DIR=

if (DCA_HAVE_CUDA)
  target_compile_definitions(ctaux_walker_performance_test PRIVATE DCA_HAVE_CUDA)
  target_link_libraries(ctaux_walker_performance_test PRIVATE ${DCA_LIBS})
  target_link_directories(ctaux_walker_performance_test PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/lib64/)
  target_link_libraries(ctaux_walker_performance_test PRIVATE nvToolsExt ${DCA_LIBS})
endif ()
+61 −72
Original line number Diff line number Diff line
@@ -18,9 +18,8 @@
#include <cuda_profiler_api.h>
#endif

#include "dca/io/hdf5/hdf5_reader.hpp"
#include "dca/io/json/json_reader.hpp"
#include "dca/config/mc_options.hpp"
#include "dca/io/json/json_reader.hpp"
#include "dca/math/random/std_random_wrapper.hpp"
#include "dca/phys/dca_data/dca_data.hpp"
#include "dca/phys/domains/cluster/symmetries/point_groups/2d/2d_square.hpp"
@@ -30,9 +29,7 @@
#include "dca/parallel/no_threading/no_threading.hpp"
#include "dca/phys/parameters/parameters.hpp"
#include "dca/profiling/events/time.hpp"
#include "dca/profiling/counting_profiler.hpp"
#include "dca/profiling/events/time_event.hpp"
#include "dca/util/ignore.hpp"
#include "dca/profiling/cuda_profiler.hpp"

const std::string input_dir = DCA_SOURCE_DIR "/test/performance/phys/ctaux/";

@@ -41,42 +38,44 @@ using Lattice = dca::phys::models::bilayer_lattice<dca::phys::domains::D4>;
using Model = dca::phys::models::TightBindingModel<Lattice>;
using Threading = dca::parallel::NoThreading;
using Concurrency = dca::parallel::NoConcurrency;
using Profiler = dca::profiling::CountingProfiler<dca::profiling::time_event<std::size_t>>;
using Profiler = dca::profiling::CudaProfiler;
using Parameters = dca::phys::params::Parameters<Concurrency, Threading, Profiler, Model, RngType,
                                                 dca::phys::solver::CT_AUX>;
using Data = dca::phys::DcaData<Parameters>;
using Real = dca::config::McOptions::MCScalar;
template <dca::linalg::DeviceType device_t>
using Walker =
    dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, dca::config::McOptions::MCScalar>;
using Walker = dca::phys::solver::ctaux::CtauxWalker<device_t, Parameters, Data, Real>;

int main(int argc, char** argv) {
  bool test_cpu(true), test_gpu(true);
#ifdef DCA_HAVE_CUDA
  int submatrix_size = -1;
  int n_warmup = 30;
  int n_sweeps = 5;
  dca::util::ignoreUnused(test_gpu);
  int n_walkers = -1;

  for (int i = 0; i < argc; ++i) {
    const std::string arg(argv[i]);
    if (arg == "--skip_cpu")
      test_cpu = false;
    else if (arg == "--skip_gpu")
      test_gpu = false;
    else if (arg == "--submatrix_size")
    if (arg == "--submatrix_size")
      submatrix_size = std::atoi(argv[i + 1]);
    else if (arg == "--n_sweeps")
      n_sweeps = std::atoi(argv[i + 1]);
    else if (arg == "--n_walkers")
      n_walkers = std::atoi(argv[i + 1]);
  }

  Concurrency concurrency(argc, argv);
  Parameters parameters("", concurrency);
  parameters.read_input_and_broadcast<dca::io::JSONReader>(input_dir +
                                                           "bilayer_lattice_input.json");
  if (submatrix_size != -1)
    parameters.set_max_submatrix_size(submatrix_size);

  parameters.update_model();
  parameters.update_domains();

  if (submatrix_size != -1)
    parameters.set_max_submatrix_size(submatrix_size);
  if (n_walkers == -1)
    n_walkers = parameters.get_walkers();

  // Initialize data with G0 computation.
  Data data(parameters);
  data.initialize();
@@ -86,72 +85,62 @@ int main(int argc, char** argv) {
    std::cout << str << ": time taken: " << time.sec + 1e-6 * time.usec << std::endl;
  };

  auto do_sweeps = [&parameters](auto& walker, int n) {
  auto do_sweeps = [&parameters](auto& walker, int n, bool verbose) {
    for (int i = 0; i < n; ++i) {
      walker.doSweep();
      if (verbose)
        walker.updateShell(i, n);
    }
  };
  std::cout << "\n\n  *********** GPU integration  ***************\n";
  std::cout << "Nr walkers: " << n_walkers << "\n\n";

  std::cout << "Integrating with max-submatrix-size: " << parameters.get_max_submatrix_size()
            << std::endl;

  if (test_cpu) {
    std::cout << "\n\n  *********** CPU integration  ***************\n" << std::endl;
  dca::linalg::util::initializeMagma();
  dca::linalg::util::resizeHandleContainer(n_walkers);

    // TODO: always start if the profiler supports the writing of multiple files.
    if (!test_gpu)
      Profiler::start();
  RngType::resetCounter();
  std::vector<RngType> rngs;
  std::vector<Walker<dca::linalg::GPU>> walkers;
  rngs.reserve(n_walkers);
  walkers.reserve(n_walkers);
  for (int i = 0; i < n_walkers; ++i) {
    rngs.emplace_back(0, 1, 0);
    walkers.emplace_back(parameters, data, rngs.back(), i);
  }

    // Do one integration step.
    RngType rng(0, 1, 0);
    Walker<dca::linalg::CPU> walker(parameters, data, rng, 0);
    walker.initialize();
  std::vector<std::future<void>> fs;
  dca::parallel::ThreadPool pool(n_walkers);
  for (int i = 0; i < n_walkers; ++i) {
    fs.push_back(pool.enqueue([&do_sweeps, &walkers, i, n_warmup]() {
      walkers[i].initialize();
      do_sweeps(walkers[i], n_warmup, i == 0);
      walkers[i].is_thermalized() = true;
    }));
  }

    do_sweeps(walker, n_warmup);
  for (auto& f : fs)
    f.get();
  fs.clear();
  std::cout << "\n Warmed up.\n" << std::endl;

  // Timed section.
  cudaProfilerStart();
  Profiler::start();
  dca::profiling::WallTime start_t;
    do_sweeps(walker, n_sweeps);
    dca::profiling::WallTime integration_t;
    walker.printSummary();

    std::cout << std::endl;
    printTime("Integration CPU", start_t, integration_t);

    if (!test_gpu)
      Profiler::stop(concurrency, "profile_cpu.txt");
  for (int i = 0; i < n_walkers; ++i) {
    fs.push_back(pool.enqueue(
        [&do_sweeps, &walkers, i, n_sweeps]() { do_sweeps(walkers[i], n_sweeps, i == 0); }));
  }
  for (auto& f : fs)
    f.get();

#ifdef DCA_HAVE_CUDA
  if (test_gpu) {
    std::cout << "\n\n  *********** GPU integration  ***************\n\n";
    std::cout.flush();
    dca::linalg::util::initializeMagma();

    Profiler::start();

    RngType::resetCounter();
    RngType rng(0, 1, 0);
    Walker<dca::linalg::GPU> walker_gpu(parameters, data, rng, 0);
    walker_gpu.initialize();

    do_sweeps(walker_gpu, n_warmup);
    std::cout << "\n Warmed up.\n" << std::endl;

    // Timed section.
    cudaProfilerStart();
    dca::profiling::WallTime start_t;
    do_sweeps(walker_gpu, n_sweeps);
  dca::profiling::WallTime integration_t;
  Profiler::stop();
  cudaProfilerStop();
    walker_gpu.printSummary();

    Profiler::stop(concurrency, "profile_gpu.txt");

  std::cout << std::endl;
  printTime("Integration GPU", start_t, integration_t);
  }

#endif  // DCA_HAVE_CUDA
}