Loading src/linalg/lapack/multiply_diagonal_gpu.cu +18 −25 Original line number Diff line number Diff line Loading @@ -5,7 +5,8 @@ // See LICENSE for terms of usage. // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Raffaele Solca' (rasolca@itp.phys.ethz.ch) // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // Raffaele Solca' (rasolca@itp.phys.ethz.ch) // // This file implements laset_gpu.hpp. Loading @@ -24,20 +25,16 @@ namespace lapack { namespace kernels { // dca::linalg::lapack::kernels:: constexpr int multiply_diag_block_size_x = 128; constexpr int multiply_diag_block_size_x = 32; constexpr int multiply_diag_block_size_y = 32; template <typename Type> __global__ void multiplyDiagonalLeft(int m, int n, const Type* d, int inc_d, const Type* a, int lda, Type* b, int ldb) { // Work on a tile of size (blockDim.x x multiply_diag_block_size_y). int i = threadIdx.x + blockIdx.x * blockDim.x; const int i = threadIdx.x + blockIdx.x * blockDim.x; const int j = threadIdx.y + blockIdx.y * blockDim.y; if (i < m) { int js = blockIdx.y * multiply_diag_block_size_y; int je = min(n, (blockIdx.y + 1) * blockDim.x); for (int j = js; j < je; ++j) if (i < m && j < n) { b[i + j * ldb] = d[i * inc_d] * a[i + j * lda]; } } Loading @@ -45,14 +42,10 @@ __global__ void multiplyDiagonalLeft(int m, int n, const Type* d, int inc_d, con template <typename Type> __global__ void multiplyDiagonalRight(int m, int n, const Type* a, int lda, const Type* d, int inc_d, Type* b, int ldb) { // Work on a tile of size (blockDim.x x multiply_diag_block_size_y). int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < m) { int js = blockIdx.y * multiply_diag_block_size_y; int je = min(n, (blockIdx.y + 1) * blockDim.x); const int i = threadIdx.x + blockIdx.x * blockDim.x; const int j = threadIdx.y + blockIdx.y * blockDim.y; for (int j = js; j < je; ++j) if (i < m && j < n) { b[i + j * ldb] = d[j * inc_d] * a[i + j * lda]; } } Loading @@ -68,11 +61,11 @@ void multiplyDiagonalLeft_gpu(int m, int n, const Type* d, int inc_d, const Type if (m > 0 && n > 0) { checkErrorsCudaDebug(); int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); const int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); const int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); dim3 threads(kernels::multiply_diag_block_size_x); dim3 blocks(bl_x, bl_y); const dim3 threads(kernels::multiply_diag_block_size_x, kernels::multiply_diag_block_size_y); const dim3 blocks(bl_x, bl_y); cudaStream_t stream = dca::linalg::util::getStream(thread_id, stream_id); Loading @@ -99,11 +92,11 @@ void multiplyDiagonalRight_gpu(int m, int n, const Type* a, int lda, const Type* if (m > 0 && n > 0) { checkErrorsCudaDebug(); int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); const int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); const int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); dim3 threads(kernels::multiply_diag_block_size_x); dim3 blocks(bl_x, bl_y); const dim3 threads(kernels::multiply_diag_block_size_x, kernels::multiply_diag_block_size_y); const dim3 blocks(bl_x, bl_y); cudaStream_t stream = dca::linalg::util::getStream(thread_id, stream_id); Loading Loading
src/linalg/lapack/multiply_diagonal_gpu.cu +18 −25 Original line number Diff line number Diff line Loading @@ -5,7 +5,8 @@ // See LICENSE for terms of usage. // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. // // Author: Raffaele Solca' (rasolca@itp.phys.ethz.ch) // Author: Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) // Raffaele Solca' (rasolca@itp.phys.ethz.ch) // // This file implements laset_gpu.hpp. Loading @@ -24,20 +25,16 @@ namespace lapack { namespace kernels { // dca::linalg::lapack::kernels:: constexpr int multiply_diag_block_size_x = 128; constexpr int multiply_diag_block_size_x = 32; constexpr int multiply_diag_block_size_y = 32; template <typename Type> __global__ void multiplyDiagonalLeft(int m, int n, const Type* d, int inc_d, const Type* a, int lda, Type* b, int ldb) { // Work on a tile of size (blockDim.x x multiply_diag_block_size_y). int i = threadIdx.x + blockIdx.x * blockDim.x; const int i = threadIdx.x + blockIdx.x * blockDim.x; const int j = threadIdx.y + blockIdx.y * blockDim.y; if (i < m) { int js = blockIdx.y * multiply_diag_block_size_y; int je = min(n, (blockIdx.y + 1) * blockDim.x); for (int j = js; j < je; ++j) if (i < m && j < n) { b[i + j * ldb] = d[i * inc_d] * a[i + j * lda]; } } Loading @@ -45,14 +42,10 @@ __global__ void multiplyDiagonalLeft(int m, int n, const Type* d, int inc_d, con template <typename Type> __global__ void multiplyDiagonalRight(int m, int n, const Type* a, int lda, const Type* d, int inc_d, Type* b, int ldb) { // Work on a tile of size (blockDim.x x multiply_diag_block_size_y). int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < m) { int js = blockIdx.y * multiply_diag_block_size_y; int je = min(n, (blockIdx.y + 1) * blockDim.x); const int i = threadIdx.x + blockIdx.x * blockDim.x; const int j = threadIdx.y + blockIdx.y * blockDim.y; for (int j = js; j < je; ++j) if (i < m && j < n) { b[i + j * ldb] = d[j * inc_d] * a[i + j * lda]; } } Loading @@ -68,11 +61,11 @@ void multiplyDiagonalLeft_gpu(int m, int n, const Type* d, int inc_d, const Type if (m > 0 && n > 0) { checkErrorsCudaDebug(); int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); const int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); const int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); dim3 threads(kernels::multiply_diag_block_size_x); dim3 blocks(bl_x, bl_y); const dim3 threads(kernels::multiply_diag_block_size_x, kernels::multiply_diag_block_size_y); const dim3 blocks(bl_x, bl_y); cudaStream_t stream = dca::linalg::util::getStream(thread_id, stream_id); Loading @@ -99,11 +92,11 @@ void multiplyDiagonalRight_gpu(int m, int n, const Type* a, int lda, const Type* if (m > 0 && n > 0) { checkErrorsCudaDebug(); int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); const int bl_x = dca::util::ceilDiv(m, kernels::multiply_diag_block_size_x); const int bl_y = dca::util::ceilDiv(n, kernels::multiply_diag_block_size_y); dim3 threads(kernels::multiply_diag_block_size_x); dim3 blocks(bl_x, bl_y); const dim3 threads(kernels::multiply_diag_block_size_x, kernels::multiply_diag_block_size_y); const dim3 blocks(bl_x, bl_y); cudaStream_t stream = dca::linalg::util::getStream(thread_id, stream_id); Loading