Unverified Commit 8c956625 authored by Peter Doak, committed by GitHub

Merge pull request #213 from gbalduzz/optional_cuda_aware_mpi

CUDA-aware MPI is optional.
parents 329e2f2e 23e03873
+0 −6
@@ -147,11 +147,7 @@ set(DCA_LIBS
  cuda_utils
)

set(SYSTEM_GPU_COUNT 0)

if (DCA_HAVE_CUDA)
  EXECUTE_PROCESS(COMMAND bash -c "nvidia-smi -L | awk 'BEGIN { num_gpu=0;} /GPU/ { num_gpu++;} END { printf(\"%d\", num_gpu) }'"
                  OUTPUT_VARIABLE SYSTEM_GPU_COUNT)
  list(APPEND DCA_LIBS
    blas_kernels
    dnfft_kernels
@@ -171,8 +167,6 @@ option(DCA_WITH_TESTS_EXTENSIVE "Build DCA++'s extensive tests." OFF)
option(DCA_WITH_TESTS_PERFORMANCE "Build DCA++'s performance tests. (Only in Release mode.)" OFF)
option(DCA_WITH_TESTS_STOCHASTIC  "Build DCA++'s stochastic tests." OFF)

set(DCA_TEST_GPU_COUNT "${SYSTEM_GPU_COUNT}" CACHE INTEGER "Number of GPUs available on one node for one test.")

set(TEST_RUNNER "" CACHE STRING "Command for executing (MPI) programs.")
set(MPIEXEC_NUMPROC_FLAG "-n" CACHE STRING "Flag used by TEST_RUNNER to specify the number of processes.")
set(MPIEXEC_PREFLAGS "" CACHE STRING "Flags to pass to TEST_RUNNER directly before the executable to run.")
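
The lines removed above counted the node's GPUs at configure time by parsing nvidia-smi -L. For orientation only: the same count is available at run time from the CUDA runtime API, as in the hedged sketch below (not part of this change).

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative only: report how many CUDA devices this process can see.
// cudaGetDeviceCount also honours CUDA_VISIBLE_DEVICES restrictions.
int main() {
  int num_gpus = 0;
  if (cudaGetDeviceCount(&num_gpus) != cudaSuccess)
    num_gpus = 0;
  std::printf("%d\n", num_gpus);
  return 0;
}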
+2 −5
@@ -23,14 +23,11 @@ set(MPIEXEC_PREFLAGS "-a 1 -g 1 -c 5" CACHE STRING
set(SMPIARGS_FLAG_NOMPI "--smpiargs=none" CACHE STRING
  "Spectrum MPI argument list flag for serial tests.")
# Let's keep this option in case we need it again in the future.
set(SMPIARGS_FLAG_MPI "" CACHE STRING "Spectrum MPI argument list flag for MPI tests.")

# When we want to use a CUDA visible devices restriction we need this flag.
set(SMPIARGS_FLAG_MPI_CVD "--smpiargs=-gpu" CACHE STRING 
  "Spectrum MPI argument list for cuda-mpi tests")
set(SMPIARGS_FLAG_MPI "--smpiargs=\"-gpu\"" CACHE STRING "Spectrum MPI argument list flag for MPI tests.")

# Enable the GPU support.
option(DCA_WITH_CUDA "Enable GPU support." ON)
option(DCA_WITH_CUDA_AWARE_MPI "Enable CUDA aware MPI." ON)

# Compile for Volta compute architecture.
set(CUDA_GPU_ARCH "sm_70" CACHE STRING "Name of the *real* architecture to build for.")
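
The cluster configuration above enables DCA_WITH_CUDA_AWARE_MPI by default; the find-module change below turns it into the preprocessor define DCA_HAVE_CUDA_AWARE_MPI. A minimal sketch of how such a guard can be consumed is given here; the helper name and buffers are hypothetical, not DCA++ code.

#include <cuda_runtime.h>
#include <mpi.h>
#include <vector>

// Hypothetical helper: sum a device buffer across ranks. With CUDA-aware MPI
// the device pointer is handed to MPI directly; otherwise the data is staged
// through a host copy.
void allreduce_device_sum(double* device_data, int count, MPI_Comm comm) {
#ifdef DCA_HAVE_CUDA_AWARE_MPI
  MPI_Allreduce(MPI_IN_PLACE, device_data, count, MPI_DOUBLE, MPI_SUM, comm);
#else
  std::vector<double> host(count);
  cudaMemcpy(host.data(), device_data, count * sizeof(double), cudaMemcpyDeviceToHost);
  MPI_Allreduce(MPI_IN_PLACE, host.data(), count, MPI_DOUBLE, MPI_SUM, comm);
  cudaMemcpy(device_data, host.data(), count * sizeof(double), cudaMemcpyHostToDevice);
#endif
}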
+9 −0
@@ -20,6 +20,10 @@ if (CUDA_FOUND)
  list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY})
  CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
  set(CUDA_SEPARABLE_COMPILATION ON)

  set(CVD_LAUNCHER "" CACHE INTERNAL "launch script for setting the Cuda visible devices.")
  # Use the following script for systems with multiple gpus visible from a rank.
  # set(CVD_LAUNCHER "test/cvd_launcher.sh" CACHE INTERNAL "")
endif()

# Find MAGMA.
@@ -48,4 +52,9 @@ endif()
if (CUDA_FOUND AND DCA_HAVE_MAGMA)
  set(DCA_HAVE_CUDA TRUE CACHE INTERNAL "")
  dca_add_haves_define(DCA_HAVE_CUDA)

  option(DCA_WITH_CUDA_AWARE_MPI "Enable CUDA aware MPI." OFF)
  if(DCA_WITH_CUDA_AWARE_MPI)
    dca_add_haves_define(DCA_HAVE_CUDA_AWARE_MPI)
  endif()
endif()
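
CVD_LAUNCHER stays empty by default and can point to a wrapper script that restricts CUDA_VISIBLE_DEVICES per rank on nodes where every rank sees all GPUs. The same effect can also be obtained inside the application; the sketch below only illustrates that idea, and the launcher-specific environment variable is an assumption, not DCA++ code.

#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative only: bind this MPI rank to one of the visible GPUs.
// The local-rank variable name depends on the launcher (Open MPI shown).
void bind_rank_to_gpu() {
  int num_gpus = 0;
  if (cudaGetDeviceCount(&num_gpus) != cudaSuccess || num_gpus == 0)
    return;  // no GPU visible, nothing to bind
  int local_rank = 0;
  if (const char* env = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"))
    local_rank = std::atoi(env);
  cudaSetDevice(local_rank % num_gpus);  // round-robin ranks over devices
}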
+5 −17
@@ -24,7 +24,7 @@ include(CMakeParseArguments)
# MPI or CUDA may be given to indicate that the test requires these libraries. MPI_NUMPROC is the
# number of MPI processes to use for a test with MPI, the default value is 1.
function(dca_add_gtest name)
  set(options FAST EXTENSIVE STOCHASTIC PERFORMANCE GTEST_MAIN MPI CUDA CUDA_CVD)
  set(options FAST EXTENSIVE STOCHASTIC PERFORMANCE GTEST_MAIN MPI CUDA)
  set(oneValueArgs MPI_NUMPROC)
  set(multiValueArgs INCLUDE_DIRS SOURCES LIBS)
  cmake_parse_arguments(DCA_ADD_GTEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -82,14 +82,6 @@ function(dca_add_gtest name)
    return()
  endif()

  if (DCA_ADD_GTEST_CUDA_CVD AND NOT DCA_HAVE_CUDA )
    return()
  endif()

  if (DCA_ADD_GTEST_CUDA_CVD AND (DCA_TEST_GPU_COUNT LESS 3) )
    return()
  endif()

  add_executable(${name} ${name}.cpp ${DCA_ADD_GTEST_SOURCES})

  # Create a macro with the project source dir. We use this as the root path for reading files in
@@ -104,7 +96,7 @@ function(dca_add_gtest name)
    target_link_libraries(${name} gtest ${DCA_ADD_GTEST_LIBS})
  endif()

  if (DCA_ADD_GTEST_CUDA OR DCA_ADD_GTEST_CUDA_CVD)
  if (DCA_ADD_GTEST_CUDA)
    target_include_directories(${name} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
    target_link_libraries(${name} ${DCA_CUDA_LIBS})
    target_compile_definitions(${name} PRIVATE DCA_HAVE_CUDA)
@@ -113,11 +105,6 @@ function(dca_add_gtest name)
      target_compile_definitions(${name} PRIVATE DCA_HAVE_MAGMA)
    endif()
    cuda_add_cublas_to_target(${name})
    # A less hacky way to do this would be good, but for now it is only used to
    # test the development-only distributed G4 feature.
    if (DCA_ADD_GTEST_CUDA_CVD)
      set(CVD_LAUNCHER "${PROJECT_SOURCE_DIR}/test/cvdlauncher.sh")
    endif()
  endif()

  target_include_directories(${name} PRIVATE
@@ -131,13 +118,14 @@ function(dca_add_gtest name)

    add_test(NAME ${name}
             COMMAND ${TEST_RUNNER} ${MPIEXEC_NUMPROC_FLAG} ${DCA_ADD_GTEST_MPI_NUMPROC}
                     ${MPIEXEC_PREFLAGS} ${SMPIARGS_FLAG_MPI_CVD} ${CVD_LAUNCHER} "$<TARGET_FILE:${name}>")
                     ${MPIEXEC_PREFLAGS}  ${CVD_LAUNCHER} "$<TARGET_FILE:${name}>")
                 target_link_libraries(${name} ${MPI_C_LIBRARIES})
  else()
    if (TEST_RUNNER)
      add_test(NAME ${name}
               COMMAND ${TEST_RUNNER} ${MPIEXEC_NUMPROC_FLAG} 1
	               ${MPIEXEC_PREFLAGS} ${SMPIARGS_FLAG_NOMPI} "$<TARGET_FILE:${name}>")
	               ${MPIEXEC_PREFLAGS} ${SMPIARGS_FLAG_NOMPI}
                   ${CVD_LAUNCHER} "$<TARGET_FILE:${name}>")
    else (TEST_RUNNER)
      add_test(NAME ${name}
               COMMAND "$<TARGET_FILE:${name}>")
+110 −114
@@ -30,15 +30,10 @@
#include "dca/distribution/dist_types.hpp"
#include "dca/function/scalar_cast.hpp"
#include "dca/function/set_to_zero.hpp"
#include "dca/util/ignore.hpp"
#include "dca/util/pack_operations.hpp"
#include "dca/util/integer_division.hpp"
#include "dca/util/type_utils.hpp"

#include "dca/parallel/util/get_workload.hpp"
#ifdef DCA_HAVE_MPI
#include "mpi.h"
#endif

namespace dca {
namespace func {
// dca::func::
@@ -54,9 +49,11 @@ public:
  // Default constructor
  // Constructs the function with the name name.
  // Postcondition: All elements are set to zero.
  // Special case: when distributed_g4_enabled, G4-related variables only get an
  // allocation of 1/p of the original G4 size, where p = #mpiranks.
  function(const std::string& name = default_name_, const DistType dist = DistType::NONE);
  function(const std::string& name = default_name_);

  // Distributed function. Access with multi-index operator() is not safe.
  template <class Concurrency>
  function(const std::string& name, const Concurrency& concurrency);

  // Copy constructor
  // Constructs the function with a copy of the elements and the name of other.
@@ -97,8 +94,6 @@ public:
  //                 The other function is in a non-specified state.
  function<scalartype, domain>& operator=(function<scalartype, domain>&& other);

  ~function();

  // Resets the function by resetting the domain object and reallocating the memory for the function
  // elements.
  // Postcondition: All elements are set to zero.
@@ -118,11 +113,12 @@ public:
    return Nb_sbdms;
  }
  std::size_t size() const {
    return nb_elements_;
    return fnc_values_.size();
  }

  // TODO: remove as it breaks class' invariant.
  void resize(std::size_t nb_elements_new) {
    nb_elements_ = nb_elements_new;
    fnc_values_.resize(nb_elements_new);
  }
  // Returns the size of the leaf domain with the given index.
  // Does not return function values!
@@ -131,31 +127,31 @@ public:
  }

  // Begin and end methods for compatibility with range for loop.
  scalartype* begin() {
    return fnc_values;
  auto begin() {
    return fnc_values_.begin();
  }
  scalartype* end() {
    return fnc_values + nb_elements_;
  auto end() {
    return fnc_values_.end();
  }
  const scalartype* begin() const {
    return fnc_values;
  const auto begin() const {
    return fnc_values_.begin();
  }
  const scalartype* end() const {
    return fnc_values + nb_elements_;
  const auto end() const {
    return fnc_values_.end();
  }

  // Returns a pointer to the function's elements.
  scalartype* values() {
    return fnc_values;
    return fnc_values_.data();
  }
  const scalartype* values() const {
    return fnc_values;
    return fnc_values_.data();
  }
  scalartype* data() {
    return fnc_values;
    return fnc_values_.data();
  }
  const scalartype* data() const {
    return fnc_values;
    return fnc_values_.data();
  }

  //
@@ -197,7 +193,7 @@ public:
  template <typename T>
  int subind_2_linind(const T ind) const {
    static_assert(std::is_integral<T>::value, "Index ind must be an integer.");
    assert(ind >= 0 && ind < nb_elements_);
    assert(ind >= 0 && ind < size());
    return ind;
  }

@@ -211,24 +207,24 @@ public:
  template <typename T>
  scalartype& operator()(const T linind) {
    static_assert(std::is_integral<T>::value, "Index linind must be an integer.");
    assert(linind >= 0 && linind < nb_elements_);
    return fnc_values[linind];
    assert(linind >= 0 && linind < size());
    return fnc_values_[linind];
  }
  template <typename T>
  const scalartype& operator()(const T linind) const {
    static_assert(std::is_integral<T>::value, "Index linind must be an integer.");
    assert(linind >= 0 && linind < nb_elements_);
    return fnc_values[linind];
    assert(linind >= 0 && linind < size());
    return fnc_values_[linind];
  }

  template <typename... Ts>
  scalartype& operator()(const Ts... subindices) {
    // We need to cast all indices to the same type for dmn_variadic.
    return fnc_values[dmn(static_cast<int>(subindices)...)];
    return fnc_values_[dmn(static_cast<int>(subindices)...)];
  }
  template <typename... Ts>
  const scalartype& operator()(const Ts... subindices) const {
    return fnc_values[dmn(static_cast<int>(subindices)...)];
    return fnc_values_[dmn(static_cast<int>(subindices)...)];
  }

  void operator+=(const function<scalartype, domain>& other);
@@ -243,7 +239,7 @@ public:
  void operator/=(scalartype c);

  // Equal-comparison operator
  // Returns true if the function's elements (fnc_values) are equal to other's elements, false
  // Returns true if the function's elements (fnc_values_) are equal to other's elements, false
  // otherwise.
  // TODO: Make the equal-comparison operator a non-member function.
  bool operator==(const function<scalartype, domain>& other) const;
@@ -276,47 +272,58 @@ public:
  template <class concurrency_t>
  void unpack(const concurrency_t& concurrency, char* buffer, int buffer_size, int& position);

  // Gather a function that was initialized as distributed.
  // Precondition: concurrency must be the same object used during construction.
  template <class Concurrency>
  function gather(const Concurrency& concurrency) const;

private:
  std::string name_;
  std::string function_type;

  domain dmn;  // TODO: Remove domain object?

  std::size_t nb_elements_;

  // The subdomains (sbdmn) represent the leaf domains, not the branch domains.
  int Nb_sbdms;
  const std::vector<std::size_t>& size_sbdm;  // TODO: Remove?
  const std::vector<std::size_t>& step_sbdm;  // TODO: Remove?

  scalartype* fnc_values;
  std::vector<scalartype> fnc_values_;
};

template <typename scalartype, class domain>
const std::string function<scalartype, domain>::default_name_ = "no-name";

template <typename scalartype, class domain>
function<scalartype, domain>::function(const std::string& name, DistType dist)
function<scalartype, domain>::function(const std::string& name)
    : name_(name),
      function_type(__PRETTY_FUNCTION__),
      dmn(),
      nb_elements_(dmn.get_size()),
      Nb_sbdms(dmn.get_leaf_domain_sizes().size()),
      size_sbdm(dmn.get_leaf_domain_sizes()),
      step_sbdm(dmn.get_leaf_domain_steps()),
      fnc_values(nullptr) {
  dca::util::ignoreUnused(dist);
#ifdef DCA_HAVE_MPI
  if (dist == DistType::MPI) {
    int my_rank, mpi_size;
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    nb_elements_ = dca::parallel::util::getWorkload(dmn.get_size(), mpi_size, my_rank);
      fnc_values_(dmn.get_size()) {
  for (int linind = 0; linind < size(); ++linind)
    setToZero(fnc_values_[linind]);
}
#endif  // DCA_HAVE_MPI
  fnc_values = new scalartype[nb_elements_];
  for (int linind = 0; linind < nb_elements_; ++linind)
    setToZero(fnc_values[linind]);

template <typename scalartype, class domain>
template <class Concurrency>
function<scalartype, domain>::function(const std::string& name, const Concurrency& concurrency)
    : name_(name),
      function_type(__PRETTY_FUNCTION__),
      dmn(),
      Nb_sbdms(dmn.get_leaf_domain_sizes().size()),
      size_sbdm(dmn.get_leaf_domain_sizes()),
      step_sbdm(dmn.get_leaf_domain_steps()) {
  // TODO: multi-index access to partitioned function is not safe.
  const std::size_t mpi_size = concurrency.number_of_processors();

  const std::size_t nb_elements = dca::util::ceilDiv(dmn.get_size(), mpi_size);
  fnc_values_.resize(nb_elements);

  for (int linind = 0; linind < nb_elements; ++linind)
    setToZero(fnc_values_[linind]);
}

template <typename scalartype, class domain>
@@ -324,17 +331,13 @@ function<scalartype, domain>::function(const function<scalartype, domain>& other
    : name_(other.name_),
      function_type(__PRETTY_FUNCTION__),
      dmn(),
      nb_elements_(dmn.get_size()),
      Nb_sbdms(dmn.get_leaf_domain_sizes().size()),
      size_sbdm(dmn.get_leaf_domain_sizes()),
      step_sbdm(dmn.get_leaf_domain_steps()),
      fnc_values(nullptr) {
      fnc_values_(other.fnc_values_) {
  if (dmn.get_size() != other.dmn.get_size())
    // The other function has not been reset after the domain was initialized.
    throw std::logic_error("Copy construction from a not yet resetted function.");

  fnc_values = new scalartype[nb_elements_];
  std::copy_n(other.fnc_values, nb_elements_, fnc_values);
}

template <typename scalartype, class domain>
@@ -342,18 +345,13 @@ function<scalartype, domain>::function(function<scalartype, domain>&& other)
    : name_(std::move(other.name_)),
      function_type(__PRETTY_FUNCTION__),
      dmn(),
      nb_elements_(dmn.get_size()),
      Nb_sbdms(dmn.get_leaf_domain_sizes().size()),
      size_sbdm(dmn.get_leaf_domain_sizes()),
      step_sbdm(dmn.get_leaf_domain_steps()),
      fnc_values(nullptr) {
      fnc_values_(std::move(other.fnc_values_)) {
  if (dmn.get_size() != other.dmn.get_size())
    // The other function has not been reset after the domain was initialized.
    throw std::logic_error("Move construction from a not yet resetted function.");

  fnc_values = other.fnc_values;
  other.nb_elements_ = 0;
  other.fnc_values = nullptr;
}

template <typename scalartype, class domain>
@@ -370,7 +368,7 @@ function<scalartype, domain>& function<scalartype, domain>::operator=(
        throw std::logic_error("Copy assignment from a not yet resetted function.");
    }

    std::copy_n(other.values(), nb_elements_, fnc_values);
    fnc_values_ = other.fnc_values_;
  }

  return *this;
@@ -383,7 +381,7 @@ function<Scalar, domain>& function<Scalar, domain>::operator=(const function<Sca
    throw(std::logic_error("Function size does not match."));
  }

  std::copy_n(other.values(), nb_elements_, fnc_values);
  fnc_values_ = other.fnc_values_;

  return *this;
}
@@ -402,33 +400,21 @@ function<scalartype, domain>& function<scalartype, domain>::operator=(
        throw std::logic_error("Move assignment from a not yet resetted function.");
    }

    delete[] fnc_values;
    fnc_values = other.fnc_values;

    other.nb_elements_ = 0;
    other.fnc_values = nullptr;
    fnc_values_ = std::move(other.fnc_values_);
  }

  return *this;
}

template <typename scalartype, class domain>
function<scalartype, domain>::~function() {
  delete[] fnc_values;
}

template <typename scalartype, class domain>
void function<scalartype, domain>::reset() {
  dmn.reset();

  nb_elements_ = dmn.get_size();
  fnc_values_.resize(dmn.get_size());
  Nb_sbdms = dmn.get_leaf_domain_sizes().size();

  delete[] fnc_values;
  fnc_values = new scalartype[nb_elements_];

  for (int linind = 0; linind < nb_elements_; ++linind)
    setToZero(fnc_values[linind]);
  for (int linind = 0; linind < size(); ++linind)
    setToZero(fnc_values_[linind]);
}

template <typename scalartype, class domain>
@@ -480,8 +466,8 @@ scalartype& function<scalartype, domain>::operator()(const int* const subind) {
  int linind;
  subind_2_linind(subind, linind);

  assert(linind >= 0 && linind < nb_elements_);
  return fnc_values[linind];
  assert(linind >= 0 && linind < size());
  return fnc_values_[linind];
}

template <typename scalartype, class domain>
@@ -489,64 +475,64 @@ const scalartype& function<scalartype, domain>::operator()(const int* const subi
  int linind;
  subind_2_linind(subind, linind);

  assert(linind >= 0 && linind < nb_elements_);
  return fnc_values[linind];
  assert(linind >= 0 && linind < size());
  return fnc_values_[linind];
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator+=(const function<scalartype, domain>& other) {
  for (int linind = 0; linind < nb_elements_; ++linind)
    fnc_values[linind] += other(linind);
  for (int linind = 0; linind < size(); ++linind)
    fnc_values_[linind] += other(linind);
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator-=(const function<scalartype, domain>& other) {
  for (int linind = 0; linind < nb_elements_; ++linind)
    fnc_values[linind] -= other(linind);
  for (int linind = 0; linind < size(); ++linind)
    fnc_values_[linind] -= other(linind);
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator*=(const function<scalartype, domain>& other) {
  for (int linind = 0; linind < nb_elements_; ++linind)
    fnc_values[linind] *= other(linind);
  for (int linind = 0; linind < size(); ++linind)
    fnc_values_[linind] *= other(linind);
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator/=(const function<scalartype, domain>& other) {
  for (int linind = 0; linind < nb_elements_; ++linind) {
  for (int linind = 0; linind < size(); ++linind) {
    assert(std::abs(other(linind)) > 1.e-16);
    fnc_values[linind] /= other(linind);
    fnc_values_[linind] /= other(linind);
  }
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator=(const scalartype c) {
  for (int linind = 0; linind < nb_elements_; linind++)
    fnc_values[linind] = c;
  for (int linind = 0; linind < size(); linind++)
    fnc_values_[linind] = c;
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator+=(const scalartype c) {
  for (int linind = 0; linind < nb_elements_; linind++)
    fnc_values[linind] += c;
  for (int linind = 0; linind < size(); linind++)
    fnc_values_[linind] += c;
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator-=(const scalartype c) {
  for (int linind = 0; linind < nb_elements_; linind++)
    fnc_values[linind] -= c;
  for (int linind = 0; linind < size(); linind++)
    fnc_values_[linind] -= c;
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator*=(const scalartype c) {
  for (int linind = 0; linind < nb_elements_; linind++)
    fnc_values[linind] *= c;
  for (int linind = 0; linind < size(); linind++)
    fnc_values_[linind] *= c;
}

template <typename scalartype, class domain>
void function<scalartype, domain>::operator/=(const scalartype c) {
  for (int linind = 0; linind < nb_elements_; linind++)
    fnc_values[linind] /= c;
  for (int linind = 0; linind < size(); linind++)
    fnc_values_[linind] /= c;
}

template <typename scalartype, class domain>
@@ -555,8 +541,8 @@ bool function<scalartype, domain>::operator==(const function<scalartype, domain>
    // One of the functions has not been reset after the domain was initialized.
    throw std::logic_error("Comparing functions of different sizes.");

  for (int i = 0; i < nb_elements_; ++i)
    if (other(i) != fnc_values[i])
  for (int i = 0; i < size(); ++i)
    if (other(i) != fnc_values_[i])
      return false;

  return true;
@@ -574,7 +560,8 @@ void function<scalartype, domain>::slice(const int sbdm_index, int* subind,
  subind_2_linind(subind, linind);

  for (int i = 0; i < size_sbdm[sbdm_index]; i++)
    fnc_vals[i] = ScalarCast<new_scalartype>::execute(fnc_values[linind + i * step_sbdm[sbdm_index]]);
    fnc_vals[i] =
        ScalarCast<new_scalartype>::execute(fnc_values_[linind + i * step_sbdm[sbdm_index]]);
}

template <typename scalartype, class domain>
@@ -602,12 +589,12 @@ void function<scalartype, domain>::slice(const int sbdm_index_1, const int sbdm_

  for (int j = 0; j < size_sbdm_2; j++) {
    fnc_ptr_left = &fnc_vals[0 + j * size_sbdm_1];
    fnc_ptr_right = &fnc_values[linind + j * step_sbdm_2];
    fnc_ptr_right = &fnc_values_[linind + j * step_sbdm_2];

    for (int i = 0; i < size_sbdm_1; i++)
      fnc_ptr_left[i] = fnc_ptr_right[i * step_sbdm_1];
    //       fnc_vals[i+j*size_sbdm[sbdm_index_1]] = fnc_values[linind + i*step_sbdm[sbdm_index_1] +
    //       j*step_sbdm[sbdm_index_2]];
    //       fnc_vals[i+j*size_sbdm[sbdm_index_1]] = fnc_values_[linind + i*step_sbdm[sbdm_index_1]
    //       + j*step_sbdm[sbdm_index_2]];
  }
}

@@ -623,7 +610,7 @@ void function<scalartype, domain>::distribute(const int sbdm_index, int* subind,
  subind_2_linind(subind, linind);

  for (int i = 0; i < size_sbdm[sbdm_index]; i++)
    fnc_values[linind + i * step_sbdm[sbdm_index]] = ScalarCast<scalartype>::execute(fnc_vals[i]);
    fnc_values_[linind + i * step_sbdm[sbdm_index]] = ScalarCast<scalartype>::execute(fnc_vals[i]);
}

template <typename scalartype, class domain>
@@ -642,7 +629,7 @@ void function<scalartype, domain>::distribute(const int sbdm_index_1, const int

  for (int i = 0; i < size_sbdm[sbdm_index_1]; i++)
    for (int j = 0; j < size_sbdm[sbdm_index_2]; j++)
      fnc_values[linind + i * step_sbdm[sbdm_index_1] + j * step_sbdm[sbdm_index_2]] =
      fnc_values_[linind + i * step_sbdm[sbdm_index_1] + j * step_sbdm[sbdm_index_2]] =
          fnc_vals[i + j * size_sbdm[sbdm_index_1]];
}

@@ -661,8 +648,8 @@ void function<scalartype, domain>::print_fingerprint(std::ostream& stream) const
    stream << "  " << size_sbdm[i];
  stream << "\n";

  stream << "# elements: " << nb_elements_ << "\n";
  stream << "memory: " << nb_elements_ * sizeof(scalartype) / (1024. * 1024.) << " MiB\n";
  stream << "# elements: " << size() << "\n";
  stream << "memory: " << size() * sizeof(scalartype) / (1024. * 1024.) << " MiB\n";
  stream << "****************************************\n" << std::endl;
}

@@ -673,11 +660,11 @@ void function<scalartype, domain>::print_elements(std::ostream& stream) const {
  stream << "****************************************\n";

  std::vector<int> subind(Nb_sbdms);
  for (int lindex = 0; lindex < nb_elements_; ++lindex) {
  for (int lindex = 0; lindex < size(); ++lindex) {
    linind_2_subind(lindex, subind);
    for (int index : subind)
      stream << index << "\t";
    stream << " \t" << fnc_values[lindex] << "\n";
    stream << " \t" << fnc_values_[lindex] << "\n";
  }

  stream << "****************************************\n" << std::endl;
@@ -705,6 +692,15 @@ void function<scalartype, domain>::unpack(const concurrency_t& concurrency, char
  concurrency.unpack(buffer, buffer_size, position, *this);
}

template <typename scalartype, class domain>
template <class Concurrency>
function<scalartype, domain> function<scalartype, domain>::gather(const Concurrency& concurrency) const {
  function result(name_);

  concurrency.gather(*this, result, concurrency);
  return result;
}

}  // namespace func
}  // namespace dca
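
The function.hpp changes above replace the manually managed fnc_values array and nb_elements_ counter with a std::vector, add a constructor that sizes a distributed function to roughly 1/p of the domain (p = number of ranks), and add gather() to reassemble the full function through the concurrency object. The standalone sketch below models that pattern with a hypothetical single-rank concurrency stub; it is a simplified illustration, not DCA++ code.

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical single-rank stand-in for the Concurrency interface used by the
// new constructor and by gather(): number_of_processors() and gather(...).
struct SerialConcurrency {
  std::size_t number_of_processors() const { return 1; }
  template <class F>
  void gather(const F& local, F& result, const SerialConcurrency&) const {
    result = local;  // with one rank the local piece already is the full function
  }
};

// Minimal model of the refactor: std::vector storage, size() taken from the
// vector, distributed construction allocating ceil(domain_size / p) elements.
template <typename Scalar>
class SimpleFunction {
public:
  explicit SimpleFunction(std::size_t domain_size) : values_(domain_size, Scalar{}) {}

  template <class Concurrency>
  SimpleFunction(std::size_t domain_size, const Concurrency& concurrency)
      : values_((domain_size + concurrency.number_of_processors() - 1) /
                    concurrency.number_of_processors(),
                Scalar{}) {}

  std::size_t size() const { return values_.size(); }

  Scalar& operator()(std::size_t i) {
    assert(i < size());
    return values_[i];
  }

  template <class Concurrency>
  SimpleFunction gather(const Concurrency& concurrency) const {
    SimpleFunction result(size() * concurrency.number_of_processors());
    concurrency.gather(*this, result, concurrency);
    return result;
  }

private:
  std::vector<Scalar> values_;  // replaces the raw fnc_values pointer
};

int main() {
  SerialConcurrency concurrency;
  SimpleFunction<double> g4(16, concurrency);  // local slice of a 16-element domain
  g4(0) = 1.0;
  SimpleFunction<double> full = g4.gather(concurrency);
  return full.size() == 16 ? 0 : 1;
}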
