Loading cmake/dca_config.cmake +8 −0 Original line number Diff line number Diff line Loading @@ -378,6 +378,14 @@ if(DCA_SYMMETRIZE) add_compile_definitions(DCA_WITH_SYMMETRIZATION) endif() ################################################################################ # Workarounds option(DCA_FIX_BROKEN_MPICH "Re-define MPI_CXX_* datatypes as the corresponding MPI_C_* datatypes when mpich is the mpi provider." OFF) if(DCA_FIX_BROKEN_MPICH) add_compile_definitions(DCA_FIX_BROKEN_MPICH) endif() ################################################################################ # Generate applications' config files. configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in" Loading include/dca/linalg/matrix.hpp +11 −12 Original line number Diff line number Diff line Loading @@ -313,14 +313,13 @@ void Matrix<ScalarType, device_name>::resize(std::pair<int, int> new_size) { assert(new_size.first >= 0 && new_size.second >= 0); if (new_size.first > capacity_.first || new_size.second > capacity_.second) { std::pair<int, int> new_capacity = capacityMultipleOfBlockSize(new_size); ValueType* new_data = nullptr; new_data = Allocator::allocate(nrElements(new_capacity)); // hip memorycpy2D routines don't tolerate leadingDimension = 0 const std::pair<int, int> copy_size(std::min(new_size.first, size_.first), std::min(new_size.second, size_.second)); util::memoryCopy(new_data, new_capacity.first, data_, leadingDimension(), copy_size); Allocator::deallocate(data_); data_ = new_data; capacity_ = new_capacity; size_ = new_size; Loading include/dca/linalg/reshapable_matrix.hpp +0 −1 Original line number Diff line number Diff line Loading @@ -348,7 +348,6 @@ void ReshapableMatrix<ScalarType, device_name, Allocator>::setAsync( template <typename ScalarType, DeviceType device_name, class Allocator> std::size_t ReshapableMatrix<ScalarType, device_name, Allocator>::nextCapacity(const std::size_t size) { assert(size >= 0); constexpr std::size_t block_size = 512; auto next_power_of_two = [](std::size_t x) { Loading include/dca/linalg/util/copy.hpp +8 −1 Original line number Diff line number Diff line Loading @@ -40,7 +40,6 @@ void memoryCopyCpu(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_ assert(size.first <= ld_src); assert(size.first >= 0); assert(size.second >= 0); size_t ncols = size.second; for (size_t i = 0; i < ncols; ++i) { memoryCopyCpu(dest + i * ld_dest, src + i * ld_src, size.first); Loading @@ -53,6 +52,8 @@ void memoryCopyCpu(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_ // The host continues the execution of the program when the copy is terminated. template <typename ScalarType> void memoryCopy(ScalarType* dest, const ScalarType* src, size_t size) { if (size == 0) return; cudaError_t ret = cudaMemcpy(dest, src, size * sizeof(ScalarType), cudaMemcpyDefault); checkRC(ret); } Loading @@ -64,6 +65,8 @@ void memoryCopy(ScalarType* dest, const ScalarType* src, size_t size) { template <typename ScalarType> void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size) { if (ld_dest == 0 || ld_src == 0 || (size.first == 0 && size.second == 0)) return; cudaError_t ret = cudaMemcpy2D(dest, ld_dest * sizeof(ScalarType), src, ld_src * sizeof(ScalarType), size.first * sizeof(ScalarType), size.second, cudaMemcpyDefault); try { Loading @@ -78,6 +81,8 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src // Asynchronous 1D memory copy. template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, const cudaStream_t stream) { if (size == 0) return; cudaError_t ret = cudaMemcpyAsync(dest, src, size * sizeof(ScalarType), cudaMemcpyDefault, stream); try { checkRC(ret); Loading @@ -102,6 +107,8 @@ void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, int t template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size, const cudaStream_t stream) { if (ld_dest == 0 || ld_src == 0 || (size.first == 0 && size.second == 0)) return; cudaError_t ret = cudaMemcpy2DAsync(dest, ld_dest * sizeof(ScalarType), src, ld_src * sizeof(ScalarType), size.first * sizeof(ScalarType), size.second, cudaMemcpyDefault, stream); Loading include/dca/parallel/mpi_concurrency/dca_mpi.h 0 → 100644 +19 −0 Original line number Diff line number Diff line #include <mpi.h> #ifdef MPICH_NUMVERSION #ifdef DCA_FIX_BROKEN_MPICH /* Fix broken MPI-3 C++ types due to bad compiles of mpich */ #undef MPI_CXX_BOOL #define MPI_CXX_BOOL MPI_C_BOOL #undef MPI_CXX_FLOAT_COMPLEX #define MPI_CXX_FLOAT_COMPLEX MPI_C_FLOAT_COMPLEX #undef MPI_CXX_DOUBLE_COMPLEX #define MPI_CXX_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX #undef MPI_CXX_LONG_DOUBLE_COMPLEX #define MPI_CXX_LONG_DOUBLE_COMPLEX MPI_C_LONG_DOUBLE_COMPLEX #endif // DCA_FIX_BROKEN_MPICH #endif // MPICH_NUMVERSION Loading
cmake/dca_config.cmake +8 −0 Original line number Diff line number Diff line Loading @@ -378,6 +378,14 @@ if(DCA_SYMMETRIZE) add_compile_definitions(DCA_WITH_SYMMETRIZATION) endif() ################################################################################ # Workarounds option(DCA_FIX_BROKEN_MPICH "Re-define MPI_CXX_* datatypes as the corresponding MPI_C_* datatypes when mpich is the mpi provider." OFF) if(DCA_FIX_BROKEN_MPICH) add_compile_definitions(DCA_FIX_BROKEN_MPICH) endif() ################################################################################ # Generate applications' config files. configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in" Loading
include/dca/linalg/matrix.hpp +11 −12 Original line number Diff line number Diff line Loading @@ -313,14 +313,13 @@ void Matrix<ScalarType, device_name>::resize(std::pair<int, int> new_size) { assert(new_size.first >= 0 && new_size.second >= 0); if (new_size.first > capacity_.first || new_size.second > capacity_.second) { std::pair<int, int> new_capacity = capacityMultipleOfBlockSize(new_size); ValueType* new_data = nullptr; new_data = Allocator::allocate(nrElements(new_capacity)); // hip memorycpy2D routines don't tolerate leadingDimension = 0 const std::pair<int, int> copy_size(std::min(new_size.first, size_.first), std::min(new_size.second, size_.second)); util::memoryCopy(new_data, new_capacity.first, data_, leadingDimension(), copy_size); Allocator::deallocate(data_); data_ = new_data; capacity_ = new_capacity; size_ = new_size; Loading
include/dca/linalg/reshapable_matrix.hpp +0 −1 Original line number Diff line number Diff line Loading @@ -348,7 +348,6 @@ void ReshapableMatrix<ScalarType, device_name, Allocator>::setAsync( template <typename ScalarType, DeviceType device_name, class Allocator> std::size_t ReshapableMatrix<ScalarType, device_name, Allocator>::nextCapacity(const std::size_t size) { assert(size >= 0); constexpr std::size_t block_size = 512; auto next_power_of_two = [](std::size_t x) { Loading
include/dca/linalg/util/copy.hpp +8 −1 Original line number Diff line number Diff line Loading @@ -40,7 +40,6 @@ void memoryCopyCpu(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_ assert(size.first <= ld_src); assert(size.first >= 0); assert(size.second >= 0); size_t ncols = size.second; for (size_t i = 0; i < ncols; ++i) { memoryCopyCpu(dest + i * ld_dest, src + i * ld_src, size.first); Loading @@ -53,6 +52,8 @@ void memoryCopyCpu(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_ // The host continues the execution of the program when the copy is terminated. template <typename ScalarType> void memoryCopy(ScalarType* dest, const ScalarType* src, size_t size) { if (size == 0) return; cudaError_t ret = cudaMemcpy(dest, src, size * sizeof(ScalarType), cudaMemcpyDefault); checkRC(ret); } Loading @@ -64,6 +65,8 @@ void memoryCopy(ScalarType* dest, const ScalarType* src, size_t size) { template <typename ScalarType> void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size) { if (ld_dest == 0 || ld_src == 0 || (size.first == 0 && size.second == 0)) return; cudaError_t ret = cudaMemcpy2D(dest, ld_dest * sizeof(ScalarType), src, ld_src * sizeof(ScalarType), size.first * sizeof(ScalarType), size.second, cudaMemcpyDefault); try { Loading @@ -78,6 +81,8 @@ void memoryCopy(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src // Asynchronous 1D memory copy. template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, const cudaStream_t stream) { if (size == 0) return; cudaError_t ret = cudaMemcpyAsync(dest, src, size * sizeof(ScalarType), cudaMemcpyDefault, stream); try { checkRC(ret); Loading @@ -102,6 +107,8 @@ void memoryCopyAsync(ScalarType* dest, const ScalarType* src, size_t size, int t template <typename ScalarType> void memoryCopyAsync(ScalarType* dest, int ld_dest, const ScalarType* src, int ld_src, std::pair<int, int> size, const cudaStream_t stream) { if (ld_dest == 0 || ld_src == 0 || (size.first == 0 && size.second == 0)) return; cudaError_t ret = cudaMemcpy2DAsync(dest, ld_dest * sizeof(ScalarType), src, ld_src * sizeof(ScalarType), size.first * sizeof(ScalarType), size.second, cudaMemcpyDefault, stream); Loading
include/dca/parallel/mpi_concurrency/dca_mpi.h 0 → 100644 +19 −0 Original line number Diff line number Diff line #include <mpi.h> #ifdef MPICH_NUMVERSION #ifdef DCA_FIX_BROKEN_MPICH /* Fix broken MPI-3 C++ types due to bad compiles of mpich */ #undef MPI_CXX_BOOL #define MPI_CXX_BOOL MPI_C_BOOL #undef MPI_CXX_FLOAT_COMPLEX #define MPI_CXX_FLOAT_COMPLEX MPI_C_FLOAT_COMPLEX #undef MPI_CXX_DOUBLE_COMPLEX #define MPI_CXX_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX #undef MPI_CXX_LONG_DOUBLE_COMPLEX #define MPI_CXX_LONG_DOUBLE_COMPLEX MPI_C_LONG_DOUBLE_COMPLEX #endif // DCA_FIX_BROKEN_MPICH #endif // MPICH_NUMVERSION