Unverified Commit af70bd20 authored by Peter Doak's avatar Peter Doak Committed by GitHub
Browse files

Distributed G4 implementation (#201)



* apply pipeline ring algorithm in G2 send/recv

* test wip

* add ring G test and verify it works

* main_dca works with one accumulator and if local measurements are equal (w/ 0), accuracy needs verification

* update send/recv tag for ring G alg

make ringG available at compile time, add documentation, clean up code

add python tool that can diff two G4s

 remove distributed test and ensure ring G test runs correctly, add compiler flag where needed

modify G4 tiling method to rank index instead of w decomposed index

reset G4 size, get index boundary right

it works on multiple ranks, unevenly distributed G4 array size

add STL algorithm and clean up code

* switch nvlink flip from compilation flag to config settings

* remove recv buffer to avoid copy

* temporarily adding GPTL profiling library

* update python tool for G4 diff

* add multi threaded support to ring G by adding thread id and n_acc to tp_acc

* fixing typo

* add comments to mci parameters related to nvlink

* remove gptl from code

* trying to improve memory allocation

* add copyFrom function in RMatrix and modify copy operation in sendbuff

* modify copy operator and add allocate method for RMatrix, update sendbuff copy and allocation

* add allocation flag to sendbuff and remove unnecessary MPI_Barrier

* remove allocate method in RMatrix and move the allocation into cached_ndft_gpu

* remove allocation in cached_ndft_gpu, use swap op in sendbuff to G_

* cleaned up reshapable matrix assignment.

* compute start and end of G4 linearized 1D index in CPU code and launch 1d thread blocks

* rename nvlink-enabled and nvlink related variables to distributed-g4-enabled to avoid vendor-specific naming

* replace int to uint64_t type for G4 index related variable

* more index processing

* add g4 index back if distributed g4 is not enabled

* fix G4 mem allocation

* comment out gatherv, add doc, and format code

* more formatting, remove MPI related unnecessary code

* clean up the python file and add author info

* offset index in mpi_gatherv to correct pos and cleanup code

* rm std::fill as not necessary

* add wiki doc for distributedg4 and upload helper file

* add uint64_t cast wherever needed

* Make function allocate only a portion of G4 on CPU locally

* fix typos

* clean up changes in function constructor

* adding missing mpi.h headers

* rename reset_size to resize

* avoid index overflow in device code

* Added integration test for getComputeRange

* now the subindices info should only get printed in verbose

* fix off by one on kernel code

* rename nb_more_work_ranks to more_work_ranks

* fixing typo

* fix off by one for unbalanced case

* propagate index off changes into kernel code

* demonstrating and checking the function subindexspan

* rename Nb_elements to nb_elements_ in function.hpp

* add missing g4 accumulate guard

* use modulo operator and MPITypeMap G2 ScalarType for ringG alg

* Clean up G4 indices computation.
Added missing include.

* quick and not yet dry refactor to disentangle MPI dependence

* fix crash from refactor of tp_accumulator_gpu into _gpu and _mpi_gpu

* update of copyright year and names

* actually add plumbing for runtime distributed G4

* add missing mpi type header to build ringG test

* silenced memory type warning.

* more changes for working runtime distribution choice

* add missing file

* missing header cmdlauncher execute permission

* compiles but fails ringG test due to change of  sense of start_-end_

* partially fixed start end (I think)

* remove @weili's bug

* careful with the sizes even undistributed G4_ !=  tp_dmn

* fix tp_acc_mpi_gpu and ringG test

* supporting cuda visible devices so ctest can run the ringG test.

* removing needless constructor complication

* ringG fixed on summit, equivalent to the smpiargs for other plat?

* slight modification for smpiargs

* fixing test compilation failure.

* hopefully this will placate gcc 8.3.0 on cray

* more fixes for CI, cautionary comment in function.hpp

* add comment

Co-authored-by: default avatarWeile Wei <lokwei9@gmail.com>
Co-authored-by: default avatargbalduzz <gbalduzz@itp.phys.ethz.ch>
parent 0947da88
Loading
Loading
Loading
Loading
+15 −15
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include "dca/io/json/json_reader.hpp"
#include "dca/util/git_version.hpp"
#include "dca/util/modules.hpp"
#include "dca/application/dca_loop_dispatch.hpp"

int main(int argc, char** argv) {
  if (argc < 2) {
@@ -71,24 +72,23 @@ int main(int argc, char** argv) {
    DcaDataType dca_data(parameters);
    dca_data.initialize();

    DcaLoopType dca_loop(parameters, dca_data, concurrency);

    dca::DistType distribution = parameters.get_g4_distribution();
    switch(distribution) {
    case dca::DistType::MPI:
    {
      Profiler profiler(__FUNCTION__, __FILE__, __LINE__);

      dca_loop.initialize();
      dca_loop.execute();
      dca_loop.finalize();
      DCALoopDispatch<dca::DistType::MPI> dca_loop_dispatch;
      dca_loop_dispatch(parameters, dca_data, concurrency);
    }
      break;
    case dca::DistType::NONE:
    {
      DCALoopDispatch<dca::DistType::NONE> dca_loop_dispatch;
      dca_loop_dispatch(parameters, dca_data, concurrency);
    }
      break;
    }
    
    Profiler::stop(concurrency, parameters.get_filename_profiling());

    if (concurrency.id() == concurrency.first()) {
      std::cout << "\nProcessor " << concurrency.id() << " is writing data." << std::endl;
      dca_loop.write();

      std::cout << "\nFinish time: " << dca::util::print_time() << "\n" << std::endl;
    }
  }
  catch (const std::exception& err) {
    std::cout << "Unhandled exception in main function:\n\t" << err.what();
+4 −0
Original line number Diff line number Diff line
@@ -25,6 +25,10 @@ set(SMPIARGS_FLAG_NOMPI "--smpiargs=none" CACHE STRING
# Let's keep this option in case we need it again in the future.
set(SMPIARGS_FLAG_MPI "" CACHE STRING "Spectrum MPI argument list flag for MPI tests.")

# When we want to use a cuda visible devices restriction we need this flag
set(SMPIARGS_FLAG_MPI_CVD "--smpiargs=-gpu" CACHE STRING 
  "Spectrum MPI argument list for cuda-mpi tests")

# Enable the GPU support.
option(DCA_WITH_CUDA "Enable GPU support." ON)

+3 −3
Original line number Diff line number Diff line
@@ -206,13 +206,13 @@ set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX SS-CT-HYB)

if (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX")
  set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_AUX)
  set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtauxClusterSolver<walker_device, ParametersType, DcaDataType>")
  set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtauxClusterSolver<walker_device, ParametersType, DcaDataType, DIST>")
  set(DCA_CLUSTER_SOLVER_INCLUDE
    "dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp")

elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB")
  set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::SS_CT_HYB)
  set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::SsCtHybClusterSolver<walker_device, ParametersType, DcaDataType>")
  set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::SsCtHybClusterSolver<walker_device, ParametersType, DcaDataType, DIST>")
  set(DCA_CLUSTER_SOLVER_INCLUDE
    "dca/phys/dca_step/cluster_solver/ss_ct_hyb/ss_ct_hyb_cluster_solver.hpp")

@@ -237,7 +237,7 @@ option(DCA_WITH_THREADED_SOLVER "Use multiple walker and accumulator threads in

if (DCA_WITH_THREADED_SOLVER)
  dca_add_config_define(DCA_WITH_THREADED_SOLVER)
  set(DCA_THREADED_SOLVER_TYPE dca::phys::solver::StdThreadQmciClusterSolver<ClusterSolverBaseType>)
  set(DCA_THREADED_SOLVER_TYPE dca::phys::solver::StdThreadQmciClusterSolver<ClusterSolverBaseType<DIST>>)
  set(DCA_THREADED_SOLVER_INCLUDE
      "dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp")
endif()
+12 −3
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ include(CMakeParseArguments)
# MPI or CUDA may be given to indicate that the test requires these libraries. MPI_NUMPROC is the
# number of MPI processes to use for a test with MPI, the default value is 1.
function(dca_add_gtest name)
  set(options FAST EXTENSIVE STOCHASTIC PERFORMANCE GTEST_MAIN MPI CUDA)
  set(options FAST EXTENSIVE STOCHASTIC PERFORMANCE GTEST_MAIN MPI CUDA CUDA_CVD)
  set(oneValueArgs MPI_NUMPROC)
  set(multiValueArgs INCLUDE_DIRS SOURCES LIBS)
  cmake_parse_arguments(DCA_ADD_GTEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -81,6 +81,10 @@ function(dca_add_gtest name)
    return()
  endif()

  if (DCA_ADD_GTEST_CUDA_CVD AND NOT DCA_HAVE_CUDA)
    return()
  endif()

  add_executable(${name} ${name}.cpp ${DCA_ADD_GTEST_SOURCES})

  # Create a macro with the project source dir. We use this as the root path for reading files in
@@ -95,7 +99,7 @@ function(dca_add_gtest name)
    target_link_libraries(${name} gtest ${DCA_ADD_GTEST_LIBS})
  endif()

  if (DCA_ADD_GTEST_CUDA)
  if (DCA_ADD_GTEST_CUDA OR DCA_ADD_GTEST_CUDA_CVD)
    target_include_directories(${name} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
    target_link_libraries(${name} ${DCA_CUDA_LIBS})
    target_compile_definitions(${name} PRIVATE DCA_HAVE_CUDA)
@@ -104,6 +108,11 @@ function(dca_add_gtest name)
      target_compile_definitions(${name} PRIVATE DCA_HAVE_MAGMA)
    endif()
    cuda_add_cublas_to_target(${name})
    # a less hacky way to do this would be good but this is used to test
    # development only feature distributed G4 at the moment.
    if (DCA_ADD_GTEST_CUDA_CVD)
      set(CVD_LAUNCHER "${PROJECT_SOURCE_DIR}/test/cvdlauncher.sh")
    endif()
  endif()

  target_include_directories(${name} PRIVATE
@@ -117,7 +126,7 @@ function(dca_add_gtest name)

    add_test(NAME ${name}
             COMMAND ${TEST_RUNNER} ${MPIEXEC_NUMPROC_FLAG} ${DCA_ADD_GTEST_MPI_NUMPROC}
                     ${MPIEXEC_PREFLAGS} ${SMPIARGS_FLAG_MPI} "$<TARGET_FILE:${name}>")
                     ${MPIEXEC_PREFLAGS} ${SMPIARGS_FLAG_MPI_CVD} ${CVD_LAUNCHER} "$<TARGET_FILE:${name}>")
                 target_link_libraries(${name} ${MPI_C_LIBRARIES})
  else()
    if (TEST_RUNNER)
+42 −0
Original line number Diff line number Diff line

// Copyright (C) 2020 ETH Zurich
// Copyright (C) 2020 UT-Battelle, LLC
// All rights reserved.
//
// See LICENSE for terms of usage.
// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
//
// Author: Peter Doak (doakpw@ornl.gov)
//
// Dispatches based on runtime selectable template parameters.
// Obviously this must be limited as it results in combinatorial amounts of
// code being compiled.

#ifndef DCA_APPLICATION_DCA_LOOP_DISPATCH_HPP
#define DCA_APPLICATION_DCA_LOOP_DISPATCH_HPP
#include "dca/config/dca.hpp"

template <dca::DistType DT>
class DCALoopDispatch {
public:
  /// Runs one complete DCA loop (initialize -> execute -> finalize) with the
  /// G4 distribution strategy fixed at compile time by the template parameter
  /// DT, then stops the profiler and has the first rank write the results.
  ///
  /// \param parameters  run configuration (also supplies the profiling filename).
  /// \param dca_data    data container the loop reads and updates.
  /// \param concurrency used to identify the writing rank (id() == first()).
  void operator()(ParametersType& parameters, DcaDataType& dca_data, Concurrency& concurrency) {
    DcaLoopType<DT> dca_loop(parameters, dca_data, concurrency);
    {
      // RAII profiler scoped around the whole loop plus the write-out.
      Profiler profiler(__FUNCTION__, __FILE__, __LINE__);

      dca_loop.initialize();
      dca_loop.execute();
      dca_loop.finalize();
      // NOTE(review): Profiler::stop() runs while `profiler` is still alive
      // (its destructor fires at the end of this scope). The pre-dispatch
      // main() called stop() after the profiler scope closed — confirm this
      // ordering is intentional and that stop() tolerates live profilers.
      Profiler::stop(concurrency, parameters.get_filename_profiling());

      // Only the first rank writes the accumulated data to disk.
      if (concurrency.id() == concurrency.first()) {
        std::cout << "\nProcessor " << concurrency.id() << " is writing data." << std::endl;
        dca_loop.write();

        std::cout << "\nFinish time: " << dca::util::print_time() << "\n" << std::endl;
      }
    }
  }
};

#endif
Loading