Loading graph_framework/cuda_context.hpp +17 −20 Original line number Diff line number Diff line Loading @@ -179,8 +179,9 @@ namespace gpu { check_error(cuDeviceGetName(device_name, 100, device), "cuDeviceGetName"); std::cout << " Device name : " << device_name << std::endl; const std::string temp = arch.str(); std::array<const char *, 3> options({ arch.str().c_str(), temp.c_str(), "--std=c++17", "--include-path=" CUDA_INCLUDE }); Loading @@ -195,6 +196,7 @@ namespace gpu { "nvrtcGetProgramLog"); std::cout << log << std::endl; free(log); std::cout << kernel_source << std::endl; } const char *mangled_kernel_name; Loading Loading @@ -321,7 +323,6 @@ namespace gpu { /// /// @params[in] index Number of times to record. //------------------------------------------------------------------------------ template<typename T> void print_results(const size_t index) { wait(); for (CUdeviceptr &buffer : buffers) { Loading @@ -336,7 +337,6 @@ namespace gpu { /// @params[in] source_index Index of the GPU buffer. /// @params[in,out] destination Host side buffer to copy to. //------------------------------------------------------------------------------ template<typename T> void copy_buffer(const size_t source_index, T *destination) { size_t size; Loading Loading @@ -391,7 +391,7 @@ namespace gpu { } source_buffer << ") {" << std::endl; source_buffer << " const index = blockIdx.x*blockDim.x + threadIdx.x;" source_buffer << " const int index = blockIdx.x*blockDim.x + threadIdx.x;" << std::endl; source_buffer << " if (index < " << size << ") {" << std::endl; Loading Loading @@ -454,25 +454,22 @@ namespace gpu { source_buffer << " const unsigned int j = threadIdx.x/32;" << std::endl; source_buffer << " const unsigned int k = threadIdx.x%32;" << std::endl; source_buffer << " if (i < " << size << ") {" << std::endl; source_buffer << " " jit::add_type<T> (source_buffer); source_buffer << " sub_max = input[i];" << std::endl; source_buffer << " " << jit::type_to_string<T> () << " sub_max = "; if constexpr (jit::is_complex<T> ()) { source_buffer << "abs(input[i]);" << std::endl; } else { source_buffer << "input[i];" << std::endl; } source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl; if constexpr (jit::is_complex<T> ()) { source_buffer << " sub_max = max(abs(sub_max), abs(input[index]));" << std::endl; source_buffer << " sub_max = max(sub_max, abs(input[index]));" << std::endl; } else { source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl; } source_buffer << " }" << std::endl; source_buffer << " __shared__ "; jit::add_type<T> (source_buffer); source_buffer << " thread_max[32];" << std::endl; source_buffer << " __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl; source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl; if constexpr (jit::is_complex<T> ()) { source_buffer << " sub_max = max(abs(sub_max), abs(__shfl_down_sync(__activemask(), sub_max, index)));" << std::endl; } else { source_buffer << " sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl; } source_buffer << " }" << std::endl; source_buffer << " thread_max[j] = sub_max;" << std::endl; source_buffer << " __syncthreads();" << std::endl; Loading Loading
graph_framework/cuda_context.hpp +17 −20 Original line number Diff line number Diff line Loading @@ -179,8 +179,9 @@ namespace gpu { check_error(cuDeviceGetName(device_name, 100, device), "cuDeviceGetName"); std::cout << " Device name : " << device_name << std::endl; const std::string temp = arch.str(); std::array<const char *, 3> options({ arch.str().c_str(), temp.c_str(), "--std=c++17", "--include-path=" CUDA_INCLUDE }); Loading @@ -195,6 +196,7 @@ namespace gpu { "nvrtcGetProgramLog"); std::cout << log << std::endl; free(log); std::cout << kernel_source << std::endl; } const char *mangled_kernel_name; Loading Loading @@ -321,7 +323,6 @@ namespace gpu { /// /// @params[in] index Number of times to record. //------------------------------------------------------------------------------ template<typename T> void print_results(const size_t index) { wait(); for (CUdeviceptr &buffer : buffers) { Loading @@ -336,7 +337,6 @@ namespace gpu { /// @params[in] source_index Index of the GPU buffer. /// @params[in,out] destination Host side buffer to copy to. //------------------------------------------------------------------------------ template<typename T> void copy_buffer(const size_t source_index, T *destination) { size_t size; Loading Loading @@ -391,7 +391,7 @@ namespace gpu { } source_buffer << ") {" << std::endl; source_buffer << " const index = blockIdx.x*blockDim.x + threadIdx.x;" source_buffer << " const int index = blockIdx.x*blockDim.x + threadIdx.x;" << std::endl; source_buffer << " if (index < " << size << ") {" << std::endl; Loading Loading @@ -454,25 +454,22 @@ namespace gpu { source_buffer << " const unsigned int j = threadIdx.x/32;" << std::endl; source_buffer << " const unsigned int k = threadIdx.x%32;" << std::endl; source_buffer << " if (i < " << size << ") {" << std::endl; source_buffer << " " jit::add_type<T> (source_buffer); source_buffer << " sub_max = input[i];" << std::endl; source_buffer << " " << jit::type_to_string<T> () << " sub_max = "; if constexpr (jit::is_complex<T> ()) { source_buffer << "abs(input[i]);" << std::endl; } else { source_buffer << "input[i];" << std::endl; } source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl; if constexpr (jit::is_complex<T> ()) { source_buffer << " sub_max = max(abs(sub_max), abs(input[index]));" << std::endl; source_buffer << " sub_max = max(sub_max, abs(input[index]));" << std::endl; } else { source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl; } source_buffer << " }" << std::endl; source_buffer << " __shared__ "; jit::add_type<T> (source_buffer); source_buffer << " thread_max[32];" << std::endl; source_buffer << " __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl; source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl; if constexpr (jit::is_complex<T> ()) { source_buffer << " sub_max = max(abs(sub_max), abs(__shfl_down_sync(__activemask(), sub_max, index)));" << std::endl; } else { source_buffer << " sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl; } source_buffer << " }" << std::endl; source_buffer << " thread_max[j] = sub_max;" << std::endl; source_buffer << " __syncthreads();" << std::endl; Loading