Loading graph_framework/cuda_context.hpp +113 −32 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ #define cuda_context_h #import <vector> #import <array> #import <cuda.h> #import <nvrtc.h> Loading @@ -26,9 +27,9 @@ namespace gpu { /// The cuda context. CUcontext context; /// The cuda code library. CUlibrary library; CUmodule module; /// The cuda kernel; CUkernel kernel; CUfunction function; /// Buffer objects. std::vector<CUdeviceptr> buffers; /// Cuda stream. Loading @@ -37,6 +38,16 @@ namespace gpu { unsigned int thread_groups; /// Number of threads in a group. unsigned int threads_per_group; /// Result buffers. std::vector<void *> result_buffers; /// Index offset. size_t buffer_offset; /// Buffer element size. size_t buffer_element_size; /// Time offset. size_t time_offset; /// Result buffer size; size_t result_size; public: //------------------------------------------------------------------------------ Loading @@ -52,7 +63,7 @@ namespace gpu { /// @brief Cuda context destructor. //------------------------------------------------------------------------------ ~cuda_context() { cuLibraryUnload(library); cuModuleUnload(module); for (CUdeviceptr &ptr : buffers) { cuMemFree(ptr); Loading @@ -68,13 +79,19 @@ namespace gpu { /// @param[in] kernel_source Source code buffer for the kernel. /// @param[in] kernel_name Name of the kernel for later reference. /// @param[in] inputs Input nodes of the kernel. /// @param[in] outputs Output nodes of the kernel. /// @param[in] num_rays Number of rays to trace. /// @param[in] num_times Number of times to record. /// @param[in] ray_index Index of the ray to save. //------------------------------------------------------------------------------ template<class BACKEND> void create_pipeline(const std::string kernel_source, const std::string kernel_name, graph::input_nodes<BACKEND> inputs, const size_t num_rays) { graph::output_nodes<BACKEND> outputs, const size_t num_rays, const size_t num_times, const size_t ray_index) { nvrtcProgram kernel_program; nvrtcCreateProgram(&kernel_program, kernel_source.c_str(), Loading @@ -88,65 +105,129 @@ namespace gpu { std::cout << " Major compute capability : " << compute_version << std::endl; // FIXME: Hardcoded for ada gpus for now. const char **options = { std::array<char *, 2> options({ "--gpu-architecture=compute_90", "--std=c++20" }; nvrtcCompileProgram(kernel_program, 2, options); }); nvrtcCompileProgram(kernel_program, 2, options.data()); char *mangled_kernel_name; nvrtcGetLoweredName(kernel_program, kernel_name.c_str(), &mangled_kernel_name); const_cast<const char **> (&mangled_kernel_name)); std::cout << " Mangled Kernel Name : " << mangled_kernel_name << std::endl; size_t ptx_size; nvrtcGetPTXSize(kernel_program, &ptx_size); char *ptx = malloc(ptx_size); char *ptx = static_cast<char *> (malloc(ptx_size)); nvrtcGetPTX(kernel_program, ptx); cuLibraryLoadData(&library, ptx, NULL, NULL, 0, NULL, NULL, 0); cuLibraryGetKernel(&kernel, library, mangled_kernel_name); cuModuleLoadDataEx(&module, ptx, 0, NULL, NULL); cuModuleGetFunction(&function, module, mangled_kernel_name); free(ptx); buffer_element_size = sizeof(typename BACKEND::base); buffer_offset = ray_index*buffer_element_size; time_offset = 0; result_size = num_times*buffer_element_size; for (graph::shared_variable<BACKEND> &input : inputs) { const BACKEND backend = input->evaluate(); CUdeviceptr ptr; const size_t bytes = backend.size()*sizeof(typename BACKEND::base); cuMemAlloc(&ptr, bytes); cuMemcpuHtoD(ptr, &backend[0], bytes); cuMemAlloc(&ptr, backend.size()*buffer_element_size); cuMemcpyHtoD(ptr, &backend[0], backend.size()*buffer_element_size); buffers.push_back(ptr); void *hptr; cuMemHostAlloc(&hptr, result_size, 0); result_buffers.push_back(hptr); } for (graph::shared_leaf<BACKEND> &output : outputs) { const BACKEND backend = output->evaluate(); CUdeviceptr ptr; cuMemAlloc(&ptr, backend.size()*buffer_element_size); cuMemcpyHtoD(ptr, &backend[0], backend.size()*buffer_element_size); buffers.push_back(ptr); void *hptr; cuMemHostAlloc(&hptr, result_size, 0); result_buffers.push_back(hptr); } int value; cuKernelGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel, device); cuFuncGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function); threads_per_group = value; thread_groups = num_rays/threads_per_group + (num_rays%threads_per_group ? 1 : 0); std::cout << " Threads per group : " << threads_per_group << std::endl; std::cout << " Number of groups : " << thread_groups << std::endl; std::cout << " Total problem size : " << threads_per_group*thread_groups << std::endl; encode_blit(); } //------------------------------------------------------------------------------ /// @brief Encode a blit command to the stream. /// /// blit is the metal terminology for a memcopy operation added to the command /// stream. Don't know what the cuda term is. //------------------------------------------------------------------------------ void encode_blit() { for (size_t i = 0, ie = buffers.size(); i < ie; i++) { CUdeviceptr hdptr; cuMemHostGetDevicePointer(&hdptr, result_buffers[i], 0); cuMemcpyDtoDAsync(hdptr + time_offset, buffers[i] + buffer_offset, buffer_element_size, stream); } } //------------------------------------------------------------------------------ /// @brief Perform a time step. /// /// This calls dispatches a kernel instance to the command buffer and the commits /// the job. This method is asyncronus. /// This calls dispatches a kernel instance to the command buffer and the /// commits the job. This method is asynchronous. //------------------------------------------------------------------------------ void step() { cuLaunchKernel(kernel, threads_per_group, 0, 0, thread_groups, 0, 0, NULL, stream, buffers.data(), NULL); cuLaunchKernel(function, threads_per_group, 0, 0, thread_groups, 0, 0, NULL, stream, reinterpret_cast<void**> (buffers.data()), NULL); encode_blit(); } //------------------------------------------------------------------------------ /// @brief Hold the current thread until the current command buffer has complete. /// @brief Hold the current thread until the stream has completed. //------------------------------------------------------------------------------ void wait() { for (void *hptr : result_buffers) { CUdeviceptr hdptr; cuMemHostGetDevicePointer(&hdptr, hptr, 0); cuMemcpyDtoHAsync(hptr, hdptr, result_size, stream); } cuStreamSynchronize(stream); } //------------------------------------------------------------------------------ /// @brief Print out the results. /// /// @param[in] num_times Number of times to record. //------------------------------------------------------------------------------ template<class BACKEND> void print_results(const size_t num_times) { for (size_t i = 0, ie = num_times + 1; i < ie; i++) { std::cout << i << " "; for (void *buffer : result_buffers) { std::cout << *(static_cast<typename BACKEND::base *> (buffer) + i) << " "; } std::cout << std::endl; } std::cout << std::endl; } }; } Loading Loading
graph_framework/cuda_context.hpp +113 −32 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ #define cuda_context_h #import <vector> #import <array> #import <cuda.h> #import <nvrtc.h> Loading @@ -26,9 +27,9 @@ namespace gpu { /// The cuda context. CUcontext context; /// The cuda code library. CUlibrary library; CUmodule module; /// The cuda kernel; CUkernel kernel; CUfunction function; /// Buffer objects. std::vector<CUdeviceptr> buffers; /// Cuda stream. Loading @@ -37,6 +38,16 @@ namespace gpu { unsigned int thread_groups; /// Number of threads in a group. unsigned int threads_per_group; /// Result buffers. std::vector<void *> result_buffers; /// Index offset. size_t buffer_offset; /// Buffer element size. size_t buffer_element_size; /// Time offset. size_t time_offset; /// Result buffer size; size_t result_size; public: //------------------------------------------------------------------------------ Loading @@ -52,7 +63,7 @@ namespace gpu { /// @brief Cuda context destructor. //------------------------------------------------------------------------------ ~cuda_context() { cuLibraryUnload(library); cuModuleUnload(module); for (CUdeviceptr &ptr : buffers) { cuMemFree(ptr); Loading @@ -68,13 +79,19 @@ namespace gpu { /// @param[in] kernel_source Source code buffer for the kernel. /// @param[in] kernel_name Name of the kernel for later reference. /// @param[in] inputs Input nodes of the kernel. /// @param[in] outputs Output nodes of the kernel. /// @param[in] num_rays Number of rays to trace. /// @param[in] num_times Number of times to record. /// @param[in] ray_index Index of the ray to save. //------------------------------------------------------------------------------ template<class BACKEND> void create_pipeline(const std::string kernel_source, const std::string kernel_name, graph::input_nodes<BACKEND> inputs, const size_t num_rays) { graph::output_nodes<BACKEND> outputs, const size_t num_rays, const size_t num_times, const size_t ray_index) { nvrtcProgram kernel_program; nvrtcCreateProgram(&kernel_program, kernel_source.c_str(), Loading @@ -88,65 +105,129 @@ namespace gpu { std::cout << " Major compute capability : " << compute_version << std::endl; // FIXME: Hardcoded for ada gpus for now. const char **options = { std::array<char *, 2> options({ "--gpu-architecture=compute_90", "--std=c++20" }; nvrtcCompileProgram(kernel_program, 2, options); }); nvrtcCompileProgram(kernel_program, 2, options.data()); char *mangled_kernel_name; nvrtcGetLoweredName(kernel_program, kernel_name.c_str(), &mangled_kernel_name); const_cast<const char **> (&mangled_kernel_name)); std::cout << " Mangled Kernel Name : " << mangled_kernel_name << std::endl; size_t ptx_size; nvrtcGetPTXSize(kernel_program, &ptx_size); char *ptx = malloc(ptx_size); char *ptx = static_cast<char *> (malloc(ptx_size)); nvrtcGetPTX(kernel_program, ptx); cuLibraryLoadData(&library, ptx, NULL, NULL, 0, NULL, NULL, 0); cuLibraryGetKernel(&kernel, library, mangled_kernel_name); cuModuleLoadDataEx(&module, ptx, 0, NULL, NULL); cuModuleGetFunction(&function, module, mangled_kernel_name); free(ptx); buffer_element_size = sizeof(typename BACKEND::base); buffer_offset = ray_index*buffer_element_size; time_offset = 0; result_size = num_times*buffer_element_size; for (graph::shared_variable<BACKEND> &input : inputs) { const BACKEND backend = input->evaluate(); CUdeviceptr ptr; const size_t bytes = backend.size()*sizeof(typename BACKEND::base); cuMemAlloc(&ptr, bytes); cuMemcpuHtoD(ptr, &backend[0], bytes); cuMemAlloc(&ptr, backend.size()*buffer_element_size); cuMemcpyHtoD(ptr, &backend[0], backend.size()*buffer_element_size); buffers.push_back(ptr); void *hptr; cuMemHostAlloc(&hptr, result_size, 0); result_buffers.push_back(hptr); } for (graph::shared_leaf<BACKEND> &output : outputs) { const BACKEND backend = output->evaluate(); CUdeviceptr ptr; cuMemAlloc(&ptr, backend.size()*buffer_element_size); cuMemcpyHtoD(ptr, &backend[0], backend.size()*buffer_element_size); buffers.push_back(ptr); void *hptr; cuMemHostAlloc(&hptr, result_size, 0); result_buffers.push_back(hptr); } int value; cuKernelGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel, device); cuFuncGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function); threads_per_group = value; thread_groups = num_rays/threads_per_group + (num_rays%threads_per_group ? 1 : 0); std::cout << " Threads per group : " << threads_per_group << std::endl; std::cout << " Number of groups : " << thread_groups << std::endl; std::cout << " Total problem size : " << threads_per_group*thread_groups << std::endl; encode_blit(); } //------------------------------------------------------------------------------ /// @brief Encode a blit command to the stream. /// /// blit is the metal terminology for a memcopy operation added to the command /// stream. Don't know what the cuda term is. //------------------------------------------------------------------------------ void encode_blit() { for (size_t i = 0, ie = buffers.size(); i < ie; i++) { CUdeviceptr hdptr; cuMemHostGetDevicePointer(&hdptr, result_buffers[i], 0); cuMemcpyDtoDAsync(hdptr + time_offset, buffers[i] + buffer_offset, buffer_element_size, stream); } } //------------------------------------------------------------------------------ /// @brief Perform a time step. /// /// This calls dispatches a kernel instance to the command buffer and the commits /// the job. This method is asyncronus. /// This calls dispatches a kernel instance to the command buffer and the /// commits the job. This method is asynchronous. //------------------------------------------------------------------------------ void step() { cuLaunchKernel(kernel, threads_per_group, 0, 0, thread_groups, 0, 0, NULL, stream, buffers.data(), NULL); cuLaunchKernel(function, threads_per_group, 0, 0, thread_groups, 0, 0, NULL, stream, reinterpret_cast<void**> (buffers.data()), NULL); encode_blit(); } //------------------------------------------------------------------------------ /// @brief Hold the current thread until the current command buffer has complete. /// @brief Hold the current thread until the stream has completed. //------------------------------------------------------------------------------ void wait() { for (void *hptr : result_buffers) { CUdeviceptr hdptr; cuMemHostGetDevicePointer(&hdptr, hptr, 0); cuMemcpyDtoHAsync(hptr, hdptr, result_size, stream); } cuStreamSynchronize(stream); } //------------------------------------------------------------------------------ /// @brief Print out the results. /// /// @param[in] num_times Number of times to record. //------------------------------------------------------------------------------ template<class BACKEND> void print_results(const size_t num_times) { for (size_t i = 0, ie = num_times + 1; i < ie; i++) { std::cout << i << " "; for (void *buffer : result_buffers) { std::cout << *(static_cast<typename BACKEND::base *> (buffer) + i) << " "; } std::cout << std::endl; } std::cout << std::endl; } }; } Loading