Merge branch 'master' into ct_int-system_level_test (91a2cfcf) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

build-aux/cades.cmake

0 → 100644

+57 −0

Original line number	Diff line number	Diff line
		# Initial cache list for cades
		#
		# Building on this cluster is very brittle, possibly due to Slurm and bad system-level modules
		# (or CentOS 7 in general).
		#
		# Spack generated hdf5 and magma seemed problematic so both are hand built.
		#
		# Don't expect this to work at all without sourcing
		# build-aux/cades_load_modules.sh
		#
		# Usage: cmake -C /path/to/this/file /path/to/DCA/source -D<option>=<value> -GNinja ...

		# Tests are launched through srun.
		set(TEST_RUNNER
		  "srun"
		  CACHE STRING
		  "Command for executing (MPI) programs.")
		set(MPIEXEC_NUMPROC_FLAG
		  "-n"
		  CACHE STRING
		  "Flag used by TEST_RUNNER to specify the number of processes.")

		# Each test process gets 1 GPU and 64G of memory.  Passing all tests requires running on
		# 4 P100 nodes, e.g. inside an allocation obtained with:
		#   salloc -A ccsd -p gpu_p100 --nodes=4 --mem=180G --exclusive --gres=gpu:2 -t 00:30:00
		set(MPIEXEC_PREFLAGS
		  "--mem=64G --gpus-per-task=1"
		  CACHE STRING
		  "Flags to pass to TEST_RUNNER directly before the executable to run.")

		# The Spectrum MPI argument flags are not needed on cades, so both are left empty.
		set(SMPIARGS_FLAG_NOMPI
		  ""
		  CACHE STRING
		  "Spectrum MPI argument list flag for serial tests.")
		# Kept as an option in case it is needed again in the future.
		set(SMPIARGS_FLAG_MPI
		  ""
		  CACHE STRING
		  "Spectrum MPI argument list flag for MPI tests.")

		# Enable the GPU support.
		option(DCA_WITH_CUDA "Enable GPU support." ON)
		option(DCA_WITH_CUDA_AWARE_MPI "Enable CUDA aware MPI." OFF)

		# CUDA_DIR is exported by build-aux/cades_load_modules.sh.  The expansion is quoted so that an
		# unset variable produces an empty value instead of silently dropping the value argument.
		set(CUDA_TOOLKIT_ROOT_DIR "$ENV{CUDA_DIR}" CACHE PATH "path to CUDA toolkit")

		# Compile for the Pascal compute architecture (sm_60, matching the P100 GPU nodes).
		set(CUDA_GPU_ARCH "sm_60" CACHE STRING "Name of the real architecture to build for.")

		# Do not use the static CUDA runtime.  The original note here said that Summit's static CUDA
		# runtime is bugged -- NOTE(review): confirm that this also applies to cades.
		# option() takes the help text as its second argument, so it must be given explicitly;
		# otherwise "OFF" is stored as the help text rather than as the value.
		option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library." OFF)

		# For the GPU support we also need MAGMA.
		# MAGMA_DIR is exported by build-aux/cades_load_modules.sh.
		set(MAGMA_DIR "$ENV{MAGMA_DIR}" CACHE PATH
		"Path to the MAGMA installation directory. Hint for CMake to find MAGMA.")

		# FFTW paths.  FFTW_DIR is exported by build-aux/cades_load_modules.sh.
		set(FFTW_INCLUDE_DIR "$ENV{FFTW_DIR}/include" CACHE PATH "Path to fftw3.h.")
		set(FFTW_LIBRARY "$ENV{FFTW_DIR}/lib/libfftw3.so" CACHE FILEPATH "The FFTW3(-compatible) library.")

		# HDF5 paths.  HDF5_DIR is exported by build-aux/cades_load_modules.sh.
		# HDF5_ROOT has to be a cache entry: this file is loaded with `cmake -C`, and only cache
		# entries set by an initial-cache script are carried over into the project.
		set(HDF5_ROOT "$ENV{HDF5_DIR}" CACHE PATH "Path to the HDF5 installation directory.")
		set(HDF5_INCLUDE_DIRS "$ENV{HDF5_DIR}/include" CACHE PATH "Path to hdf5 includes")
		set(HDF5_LIBRARIES "$ENV{HDF5_DIR}/lib/libhdf5_cpp.a;$ENV{HDF5_DIR}/lib/libhdf5.a" CACHE FILEPATH "The hdf5 libraries")

		option(DCA_WITH_TESTS_FAST "Fast minimal tests" ON)

		# Required by dependencies but not picked up by cmake for whatever reason.
		set(CMAKE_EXE_LINKER_FLAGS "-ldl -fopenmp" CACHE STRING "additional linking arguments needed")

build-aux/cades_load_modules.sh

0 → 100644

+30 −0

Original line number	Diff line number	Diff line
		# Loads the modules and Spack packages used to build DCA on Cades.

		# Outside of the suggested CNMS environment, uncomment the following two lines first.

		# module load env/cades-cnms
		#. $SOFTWARECNMS/spack/share/spack/setup-env.sh

		module load PE-gnu/3.0

		# Load every required Spack package, one after the other, in this order.
		for spack_spec in \
		  emacs@26.3 \
		  git \
		  gcc@8.2.0 \
		  openmpi/qnfab5m \
		  fftw%gcc@8.2.0 \
		  ninja/v2bqky4 \
		  cmake/g4ybxxf \
		  openblas@0.3.9
		do
		  spack load "${spack_spec}"
		done
		# This file is meant to be sourced, so do not leave the loop variable behind.
		unset spack_spec

		# Locations of the HDF5, MAGMA and CUDA installations used for the build.
		# HDF5_DIR, MAGMA_DIR and CUDA_DIR are read by build-aux/cades.cmake.
		export HDF5_DIR=/software/user_tools/current/cades-cnms/for_nti/hdf5
		export MAGMA_DIR=/lustre/or-hydra/cades-cnms/epd/dev/magma
		export CUDA_DIR=/software/dev_tools/swtree/cs400_centos7.2_pe2016-08/cuda/11.0/centos7.8_binary
		# Same path under a second name -- presumably for tools that expect CUDADIR; TODO confirm.
		export CUDADIR="${CUDA_DIR}"
		# Prepend the two installations; only add the separator when CMAKE_PREFIX_PATH was already set,
		# so that no empty trailing entry is created.
		export CMAKE_PREFIX_PATH="${HDF5_DIR}:${MAGMA_DIR}${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}}"

		# Take the install prefix (second column) of the loaded fftw package from spack's listing.
		export FFTW_DIR="$(spack find --loaded -p fftw | awk '/fftw/ {print $2}')"

		# Build with the MPI compiler wrappers.
		export CC="$(which mpicc)"
		export CXX="$(which mpic++)"

		# cmake like this can work if you don't want to use cades.cmake
		#rm -rf *; CXX=$(which mpic++) CC=$(which mpicc) cmake -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_DIR} -DMAGMA_DIR=${MAGMA_DIR} -DDCA_WITH_CUDA=True -DCUDA_GPU_ARCH=sm_60 -DHDF5_ROOT=${HDF5_DIR} -DHDF5_INCLUDE_DIRS=${HDF5_DIR}/include -DHDF5_LIBRARIES="${HDF5_DIR}/lib/libhdf5_cpp.a;${HDF5_DIR}/lib/libhdf5.a" -DDCA_WITH_TESTS_FAST=True -DTEST_RUNNER=srun -DMPIEXEC_NUMPROC_FLAG="-n" -DMPIEXEC_PREFLAGS="--mem=64G --gpus-per-task=1" -DCMAKE_EXE_LINKER_FLAGS="-ldl -fopenmp" -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DFFTW_DIR=${FFTW_DIR} -DFFTW_INCLUDE_DIR=${FFTW_DIR}/include -DFFTW_LIBRARY="${FFTW_DIR}/lib/libfftw3.a;${FFTW_DIR}/lib/libfftw3f.a" -GNinja ..

cmake/dca_config.cmake

+10 −2

Original line number	Diff line number	Diff line
		@@ -134,8 +134,8 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/lattice_model.hpp.in"

		################################################################################
		# Select the profiler type and enable auto-tuning.
		set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None \| Counting \| PAPI.")
		set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI)
		set(DCA_PROFILER "None" CACHE STRING "Profiler type, options are: None \| Counting \| PAPI \| Cuda.")
		set_property(CACHE DCA_PROFILER PROPERTY STRINGS None Counting PAPI Cuda)

		if (DCA_PROFILER STREQUAL "Counting")
		set(DCA_PROFILING_EVENT_TYPE dca::profiling::time_event<std::size_t>)
		@@ -149,6 +149,14 @@ elseif (DCA_PROFILER STREQUAL "PAPI")
		set(DCA_PROFILER_TYPE dca::profiling::CountingProfiler<Event>)
		set(DCA_PROFILER_INCLUDE "dca/profiling/counting_profiler.hpp")

		# Note: this profiler requires using the PTHREAD library and CUDA_TOOLS_EXT_LIBRARY
		elseif (DCA_PROFILER STREQUAL "Cuda")
		set(DCA_PROFILING_EVENT_INCLUDE "dca/profiling/events/time.hpp")
		set(DCA_PROFILING_EVENT_TYPE "void")
		set(DCA_PROFILER_TYPE dca::profiling::CudaProfiler)
		set(DCA_PROFILER_INCLUDE "dca/profiling/cuda_profiler.hpp")
		link_libraries(${CUDA_nvToolsExt_LIBRARY})

		else() # DCA_PROFILER = None
		# The NullProfiler doesn't have an event type.
		set(DCA_PROFILING_EVENT_TYPE void)

include/dca/linalg/matrix.hpp

+0 −14

Original line number	Diff line number	Diff line
		@@ -194,11 +194,6 @@ public:
		// Swaps the contents of the matrix, included the name, with those of rhs.
		void swapWithName(Matrix<ScalarType, device_name>& rhs);

		// Asynchronous assignment (copy with stream = getStream(thread_id, stream_id))
		// + synchronization of stream
		template <DeviceType rhs_device_name>
		void set(const Matrix<ScalarType, rhs_device_name>& rhs, int thread_id, int stream_id);

		#ifdef DCA_HAVE_CUDA
		// Asynchronous assignment.
		template <DeviceType rhs_device_name>
		@@ -418,15 +413,6 @@ void Matrix<ScalarType, device_name>::swapWithName(Matrix<ScalarType, device_nam
		swap(rhs);
		}

		template <typename ScalarType, DeviceType device_name>
		template <DeviceType rhs_device_name>
		void Matrix<ScalarType, device_name>::set(const Matrix<ScalarType, rhs_device_name>& rhs,
		int thread_id, int stream_id) {
		resize(rhs.size_);
		util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id,
		stream_id);
		}

		#ifdef DCA_HAVE_CUDA

		template <typename ScalarType, DeviceType device_name>

include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp

+39 −42

Original line number	Diff line number	Diff line
		@@ -133,7 +133,7 @@ public:
		}

		private:
		void add_non_interacting_spins_to_configuration();
		void addNonInteractingSpinsToMatrices();

		void generate_delayed_spins(int& single_spin_updates_todo);

		@@ -307,6 +307,8 @@ private:
		int warm_up_sweeps_done_;
		util::Accumulator<std::size_t> warm_up_expansion_order_;
		util::Accumulator<std::size_t> num_delayed_spins_;
		int currently_proposed_creations_ = 0;
		int currently_proposed_annihilations_ = 0;

		// std::array<linalg::Matrix<Real, device_t>, 2> M_;
		std::array<linalg::Vector<Real, linalg::CPU>, 2> exp_v_minus_one_;
		@@ -316,6 +318,8 @@ private:
		bool config_initialized_;

		double sweeps_per_measurement_ = 1.;

		linalg::util::CudaEvent sync_streams_event_;
		};

		template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
		@@ -521,10 +525,12 @@ void CtauxWalker<device_t, Parameters, Data, Real>::doSweep() {

		template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
		void CtauxWalker<device_t, Parameters, Data, Real>::doStep(int& single_spin_updates_todo) {
		add_non_interacting_spins_to_configuration();
		configuration_.prepare_configuration();

		generate_delayed_spins(single_spin_updates_todo);

		addNonInteractingSpinsToMatrices();

		download_from_device();

		compute_Gamma_matrices();
		@@ -598,15 +604,12 @@ std::enable_if_t<dev_t == device_t && device_t == dca::linalg::CPU, void> CtauxW
		}

		template <dca::linalg::DeviceType device_t, class Parameters, class Data, typename Real>
		void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to_configuration() {
		void CtauxWalker<device_t, Parameters, Data, Real>::addNonInteractingSpinsToMatrices() {
		Profiler profiler(__FUNCTION__, "CT-AUX walker", __LINE__, thread_id);

		Gamma_up.resizeNoCopy(0);
		Gamma_dn.resizeNoCopy(0);

		// shuffle the configuration + do some configuration checks
		configuration_.shuffle_noninteracting_vertices();

		{ // update G0 for new shuffled vertices
		Profiler p("G0-matrix (update)", "CT-AUX walker", __LINE__, thread_id);

		@@ -617,18 +620,6 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_non_interacting_spins_to
		check_G0_matrices(configuration_, G0_up, G0_dn);
		#endif // DCA_WITH_QMC_BIT
		}

		/*
		if(true)
		{
		std::cout << "\n\n\t G0-TOOLS \n\n";
		G0_CPU_tools_obj.build_G0_matrix(configuration, G0_up_CPU, e_UP);
		G0_CPU_tools_obj.build_G0_matrix(configuration, G0_dn_CPU, e_DN);
		dca::linalg::matrixop::difference(G0_up_CPU, G0_up);
		dca::linalg::matrixop::difference(G0_dn_CPU, G0_dn);
		}
		*/

		{ // update N for new shuffled vertices
		Profiler p("N-matrix (update)", "CT-AUX walker", __LINE__, thread_id);

		@@ -664,6 +655,7 @@ void CtauxWalker<device_t, Parameters, Data, Real>::generate_delayed_spins(
		? generateDelayedSpinsNeglectBennett(single_spin_updates_todo)
		: generateDelayedSpinsAbortAtBennett(single_spin_updates_todo);

		// assert(single_spin_updates_proposed > 0);
		single_spin_updates_todo -= single_spin_updates_proposed;
		assert(single_spin_updates_todo >= 0);

		@@ -679,15 +671,15 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe
		assert(single_spin_updates_todo > 0);

		const auto max_num_delayed_spins = parameters_.get_max_submatrix_size();
		const auto num_non_interacting_spins_initial = configuration_.get_number_of_creatable_HS_spins();

		delayed_spins.resize(0);

		int num_creations = 0;
		int num_annihilations = 0;
		int num_statics = 0;
		int single_spin_updates_proposed = 0;

		currently_proposed_annihilations_ = 0;
		currently_proposed_creations_ = 0;

		// Do the aborted annihilation proposal.
		if (annihilation_proposal_aborted_) {
		delayed_spin_struct delayed_spin;
		@@ -704,7 +696,7 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe
		delayed_spin.new_HS_spin_value = HS_ZERO;

		delayed_spins.push_back(delayed_spin);
		++num_annihilations;
		++currently_proposed_annihilations_;
		}

		// Propose removal of a different vertex or do a static step if the configuration_ is empty.
		@@ -717,7 +709,7 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe
		delayed_spin.new_HS_spin_value = HS_ZERO;

		delayed_spins.push_back(delayed_spin);
		++num_annihilations;
		++currently_proposed_annihilations_;
		}

		else {
		@@ -730,8 +722,7 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe
		}

		// Generate more delayed spins.
		while (!annihilation_proposal_aborted_ && num_creations < num_non_interacting_spins_initial &&
		single_spin_updates_proposed < single_spin_updates_todo &&
		while (!annihilation_proposal_aborted_ && single_spin_updates_proposed < single_spin_updates_todo &&
		delayed_spins.size() < max_num_delayed_spins) {
		delayed_spin_struct delayed_spin;
		delayed_spin.is_accepted_move = false;
		@@ -757,17 +748,18 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe

		if (!annihilation_proposal_aborted_) {
		delayed_spins.push_back(delayed_spin);
		++num_annihilations;
		++currently_proposed_annihilations_;
		++single_spin_updates_proposed;
		}
		}

		else if (delayed_spin.HS_current_move == CREATION) {
		delayed_spin.random_vertex_ind = configuration_.get_random_noninteracting_vertex(true);
		delayed_spin.random_vertex_ind = configuration_.size();
		configuration_.insert_random_noninteracting_vertex(true);
		delayed_spin.new_HS_spin_value = rng() > 0.5 ? HS_UP : HS_DN;

		delayed_spins.push_back(delayed_spin);
		++num_creations;
		++currently_proposed_creations_;
		++single_spin_updates_proposed;
		}

		@@ -777,7 +769,6 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe
		++single_spin_updates_proposed;
		}
		}

		// We need to unmark all "virtual" interacting spins, that we have temporarily marked as
		// annihilatable in CT_AUX_HS_configuration::get_random_noninteracting_vertex().
		// TODO: Eliminate the need to mark and unmark these spins.
		@@ -785,7 +776,8 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsAbortAtBe
		if (spin.HS_current_move == CREATION)
		configuration_.unmarkAsAnnihilatable(spin.random_vertex_ind);

		assert(single_spin_updates_proposed == num_creations + num_annihilations + num_statics);
		assert(single_spin_updates_proposed ==
		currently_proposed_creations_ + currently_proposed_annihilations_ + num_statics);

		return single_spin_updates_proposed;
		}
		@@ -796,18 +788,18 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsNeglectBe
		assert(single_spin_updates_todo > 0);

		const auto max_num_delayed_spins = parameters_.get_max_submatrix_size();
		const auto num_non_interacting_spins_initial = configuration_.get_number_of_creatable_HS_spins();
		const auto num_interacting_spins_initial = configuration_.get_number_of_interacting_HS_spins();

		delayed_spins.resize(0);

		int num_creations = 0;
		int num_annihilations = 0;
		int num_statics = 0;
		int single_spin_updates_proposed = 0;
		int num_statics = 0;

		while ((num_interacting_spins_initial == 0 \|\| num_annihilations < num_interacting_spins_initial) &&
		num_creations < num_non_interacting_spins_initial &&
		currently_proposed_annihilations_ = 0;
		currently_proposed_creations_ = 0;

		while ((num_interacting_spins_initial == 0 \|\|
		currently_proposed_annihilations_ < num_interacting_spins_initial) &&
		single_spin_updates_proposed < single_spin_updates_todo &&
		delayed_spins.size() < max_num_delayed_spins) {
		delayed_spin_struct delayed_spin;
		@@ -832,16 +824,17 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsNeglectBe
		}

		delayed_spins.push_back(delayed_spin);
		++num_annihilations;
		++currently_proposed_annihilations_;
		++single_spin_updates_proposed;
		}

		else if (delayed_spin.HS_current_move == CREATION) {
		delayed_spin.random_vertex_ind = configuration_.get_random_noninteracting_vertex(false);
		delayed_spin.random_vertex_ind = configuration_.size();
		configuration_.insert_random_noninteracting_vertex(false);
		delayed_spin.new_HS_spin_value = rng() > 0.5 ? HS_UP : HS_DN;

		delayed_spins.push_back(delayed_spin);
		++num_creations;
		++currently_proposed_creations_;
		++single_spin_updates_proposed;
		}

		@@ -852,7 +845,8 @@ int CtauxWalker<device_t, Parameters, Data, Real>::generateDelayedSpinsNeglectBe
		}
		}

		assert(single_spin_updates_proposed == num_creations + num_annihilations + num_statics);
		assert(single_spin_updates_proposed ==
		currently_proposed_creations_ + currently_proposed_annihilations_ + num_statics);

		return single_spin_updates_proposed;
		}
		@@ -1071,8 +1065,7 @@ void CtauxWalker<device_t, Parameters, Data, Real>::add_delayed_spins_to_the_con
		configuration_.add_delayed_HS_spin(configuration_index, delayed_spins[i].new_HS_spin_value);
		}
		else {
		configuration_[configuration_index].is_creatable() = false;
		configuration_[configuration_index].is_annihilatable() = false;
		configuration_[configuration_index].set_annihilatable(false);
		}
		}
		}
		@@ -1550,6 +1543,10 @@ template <dca::linalg::DeviceType device_t, class Parameters, class Data, typena
		template <typename AccumType>
		const linalg::util::CudaEvent* CtauxWalker<device_t, Parameters, Data, Real>::computeM(
		std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
		// Stream 1 waits on stream 0.
		sync_streams_event_.record(linalg::util::getStream(thread_id, 0));
		sync_streams_event_.block(linalg::util::getStream(thread_id, 1));

		for (int s = 0; s < 2; ++s) {
		const auto& config = get_configuration().get(s == 0 ? e_UP : e_DN);
		exp_v_minus_one_[s].resizeNoCopy(config.size());