graph_framework/cuda_context.hpp  +8 −37

@@ -41,8 +41,6 @@ namespace gpu {
         unsigned int thread_groups;
 ///  Number of threads in a group.
         unsigned int threads_per_group;
-///  Result buffers.
-        std::vector<CUdeviceptr> result_buffers;
 ///  Index offset.
         size_t buffer_offset;
 ///  Buffer element size.

@@ -208,8 +206,6 @@ namespace gpu {
                 check_error(cuMemAlloc(&buffers[i], backend.size()*buffer_element_size), "cuMemAlloc");
                 check_error(cuMemcpyHtoD(buffers[i], &backend[0], backend.size()*buffer_element_size), "cuMemcpyHtoD");
                 kernel_arguments.push_back(reinterpret_cast<void *> (&buffers[i]));
-                check_error(cuMemAllocManaged(&result_buffers[i], result_size,
-                                              CU_MEM_ATTACH_GLOBAL), "cuMemAllocManaged");
             }

             for (size_t i = inputs.size(), ie = buffers.size(), j = 0; i < ie; i++, j++) {
                 const BACKEND backend = outputs[j]->evaluate();

@@ -217,8 +213,6 @@ namespace gpu {
                 check_error(cuMemAlloc(&buffers[i], backend.size()*buffer_element_size), "cuMemAlloc");
                 check_error(cuMemcpyHtoD(buffers[i], &backend[0], backend.size()*buffer_element_size), "cuMemcpyHtoD");
                 kernel_arguments.push_back(reinterpret_cast<void *> (&buffers[i]));
-                check_error(cuMemAllocManaged(&result_buffers[i], result_size,
-                                              CU_MEM_ATTACH_GLOBAL), "cuMemAllocManaged");
             }

             int value;

@@ -229,25 +223,6 @@ namespace gpu {
             std::cout << "  Threads per group  : " << threads_per_group << std::endl;
             std::cout << "  Number of groups   : " << thread_groups << std::endl;
             std::cout << "  Total problem size : " << threads_per_group*thread_groups << std::endl;
-
-            encode_blit();
         }
-
-//------------------------------------------------------------------------------
-///  @brief Encode a blit command to the stream.
-///
-///  blit is the metal terminology for a memcopy operation added to the command
-///  stream. Don't know what the cuda term is.
-//------------------------------------------------------------------------------
-        void encode_blit() {
-            for (size_t i = 0, ie = buffers.size(); i < ie; i++) {
-                check_error_async(cuMemcpyDtoDAsync(result_buffers[i] + time_offset,
-                                                    buffers[i] + buffer_offset,
-                                                    buffer_element_size,
-                                                    stream),
-                                  "check_error_async");
-            }
-            time_offset += buffer_element_size;
-        }

@@ -261,7 +236,6 @@ namespace gpu {
                                            threads_per_group, 1, 1,
                                            0, stream,
                                            kernel_arguments.data(), NULL),
                             "cuLaunchKernel");
-            encode_blit();
         }

@@ -269,22 +243,19 @@ namespace gpu {
 //------------------------------------------------------------------------------
         void wait() {
             check_error_async(cuStreamSynchronize(stream), "cuStreamSynchronize");
+            check_error(cuCtxSynchronize(), "cuCtxSynchronize");
         }

 //------------------------------------------------------------------------------
 ///  @brief Print out the results.
 ///
-///  @param[in] num_times Number of times to record.
+///  @param[in] index Particle index to print.
 //------------------------------------------------------------------------------
         template<class BACKEND>
-        void print_results(const size_t num_times) {
-            check_error(cuCtxSynchronize(), "cuCtxSynchronize");
-            for (size_t i = 0, ie = num_times + 1; i < ie; i++) {
-                std::cout << i << " ";
-                for (CUdeviceptr &buffer : result_buffers) {
-                    std::cout << reinterpret_cast<typename BACKEND::base *> (buffer)[i] << " ";
-                }
-                std::cout << std::endl;
+        void print_results(const size_t index) {
+            wait();
+            for (CUdeviceptr &buffer : buffers) {
+                std::cout << reinterpret_cast<typename BACKEND::base *> (buffer)[index] << " ";
             }
+            std::cout << std::endl;
         }
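Note on the cuda_context.hpp change: print_results() now dereferences the CUdeviceptr entries of `buffers` directly on the host after wait(). That is only legal for unified (managed) allocations; memory from plain cuMemAlloc would need an explicit cuMemcpyDtoH first. Below is a minimal sketch of that synchronize-then-read pattern against the CUDA driver API. The buffer size, element type, and index are hypothetical stand-ins, not values from this PR, and error checking is omitted for brevity.

// Sketch only: the synchronize-then-read pattern assumed by the new print_results().
// Host dereferencing of a CUdeviceptr requires a managed allocation.
#include <cuda.h>
#include <cstddef>
#include <iostream>

int main() {
    cuInit(0);
    CUdevice device;
    cuDeviceGet(&device, 0);
    CUcontext context;
    cuCtxCreate(&context, 0, device);

    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_DEFAULT);

    //  Hypothetical buffer of 1024 doubles, standing in for one entry of `buffers`.
    const size_t num_elements = 1024;
    CUdeviceptr buffer;
    cuMemAllocManaged(&buffer, num_elements*sizeof(double), CU_MEM_ATTACH_GLOBAL);

    //  ... enqueue kernels on `stream` that write into `buffer` ...

    //  Mirror of wait(): drain the stream, then the whole context.
    cuStreamSynchronize(stream);
    cuCtxSynchronize();

    //  Mirror of print_results(index): read one element per buffer on the host.
    const size_t index = 0;
    std::cout << reinterpret_cast<double *> (buffer)[index] << std::endl;

    cuMemFree(buffer);
    cuStreamDestroy(stream);
    cuCtxDestroy(context);
}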
graph_framework/jit.hpp  +4 −3

@@ -245,13 +245,14 @@ namespace jit {
             const timeing::measure_diagnostic gpu_time("GPU Time");

-            context.print_results<BACKEND> (0);
             for (size_t i = 0; i < num_steps; i++) {
                 context.step();
-                context.print_results<BACKEND> (0);
             }
+            context.wait();
+            gpu_time.stop();

             context.print_results<BACKEND> (num_steps);
-            gpu_time.stop();
         }
     };
 }
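Note on the jit.hpp change: step() only enqueues work on the stream, so printing is moved out of the timed region entirely, and wait() is called before gpu_time.stop() so the timer covers the device work rather than just the enqueue cost. A small sketch of why that ordering matters; `Context` and timed_steps are hypothetical stand-ins for illustration, and only the step()/wait() calls mirror the PR:

// Sketch only: stopping a host timer before the device drains under-reports GPU time.
#include <chrono>
#include <cstddef>

template<class Context>
double timed_steps(Context &context, const size_t num_steps) {
    const auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < num_steps; i++) {
        context.step();   //  Asynchronous: only enqueues a kernel launch.
    }
    context.wait();       //  Block until all enqueued work has finished ...
    const auto stop = std::chrono::steady_clock::now();   //  ... then read the clock.
    return std::chrono::duration<double> (stop - start).count();
}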
graph_framework/metal_context.hpp  +8 −40

@@ -34,8 +34,6 @@ namespace gpu {
         NSUInteger thread_groups;
 ///  Number of threads in a group.
         NSUInteger threads_per_group;
-///  Result buffers.
-        std::vector<id<MTLBuffer>> result_buffers;
 ///  Index offset.
         size_t buffer_offset;
 ///  Buffer element size.

@@ -105,16 +103,12 @@ namespace gpu {
                 buffers.push_back([device newBufferWithBytes:&backend[0]
                                                       length:backend.size()*buffer_element_size
                                                      options:MTLResourceStorageModeManaged]);
-                result_buffers.push_back([device newBufferWithLength:num_times*buffer_element_size
-                                                             options:MTLResourceStorageModeManaged]);
             }

             for (graph::shared_leaf<BACKEND> &output : outputs) {
                 const BACKEND backend = output->evaluate();
                 buffers.push_back([device newBufferWithBytes:&backend[0]
                                                       length:backend.size()*buffer_element_size
                                                      options:MTLResourceStorageModeManaged]);
-                result_buffers.push_back([device newBufferWithLength:num_times*buffer_element_size
-                                                             options:MTLResourceStorageModeManaged]);
             }

             threads_per_group = state.maxTotalThreadsPerThreadgroup;

@@ -123,10 +117,6 @@ namespace gpu {
             std::cout << "  Threads per group  : " << threads_per_group << std::endl;
             std::cout << "  Number of groups   : " << thread_groups << std::endl;
             std::cout << "  Total problem size : " << threads_per_group*thread_groups << std::endl;
-
-            command_buffer = [queue commandBuffer];
-            encode_blit();
-            [command_buffer commit];
         }
     }

@@ -139,23 +129,6 @@ namespace gpu {
             return options;
         }

-//------------------------------------------------------------------------------
-///  @brief Encode a blit command.
-//------------------------------------------------------------------------------
-        void encode_blit() {
-            id<MTLBlitCommandEncoder> blit = [command_buffer blitCommandEncoder];
-            for (size_t i = 0, ie = buffers.size(); i < ie; i++) {
-                [blit copyFromBuffer:buffers[i]
-                        sourceOffset:buffer_offset
-                            toBuffer:result_buffers[i]
-                   destinationOffset:time_offset
-                                size:buffer_element_size];
-            }
-            [blit endEncoding];
-
-            time_offset += buffer_element_size;
-        }
-
 //------------------------------------------------------------------------------
 ///  @brief Perform a time step.
 ///

@@ -177,8 +150,6 @@ namespace gpu {
                 threadsPerThreadgroup:MTLSizeMake(threads_per_group, 1, 1)];
                [encoder endEncoding];
-
-               encode_blit();
                [command_buffer commit];
            }
        }

@@ -189,8 +160,8 @@ namespace gpu {
         void wait() {
             command_buffer = [queue commandBuffer];
             id<MTLBlitCommandEncoder> blit = [command_buffer blitCommandEncoder];
-            for (size_t i = 0, ie = buffers.size(); i < ie; i++) {
-                [blit synchronizeResource:result_buffers[i]];
+            for (id<MTLBuffer> buffer : buffers) {
+                [blit synchronizeResource:buffer];
             }
             [blit endEncoding];

@@ -201,17 +172,14 @@ namespace gpu {
 //------------------------------------------------------------------------------
 ///  @brief Print out the results.
 ///
-///  @param[in] num_times Number of times to record.
+///  @param[in] index Particle index to print.
 //------------------------------------------------------------------------------
         template<class BACKEND>
-        void print_results(const size_t num_times) {
-            for (size_t i = 0, ie = num_times + 1; i < ie; i++) {
-                std::cout << i << " ";
-                for (id<MTLBuffer> buffer : result_buffers) {
-                    const typename BACKEND::base *contents = static_cast<typename BACKEND::base *> ([buffer contents]);
-                    std::cout << contents[i] << " ";
-                }
-                std::cout << std::endl;
+        void print_results(const size_t index) {
+            wait();
+            for (id<MTLBuffer> buffer : buffers) {
+                const typename BACKEND::base *contents = static_cast<typename BACKEND::base *> ([buffer contents]);
+                std::cout << contents[index] << " ";
             }
+            std::cout << std::endl;
         }
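Note on the metal_context.hpp change: with MTLResourceStorageModeManaged buffers, the GPU's copy must be blitted back before [buffer contents] is read on the CPU, which is what wait()'s synchronizeResource pass does before the rewritten print_results() runs. A minimal Objective-C++ sketch (matching the language of this file) of that managed-storage readback pattern; the float buffer, its length, and the index are hypothetical, and the commit/waitUntilCompleted pair assumes wait() finishes its command buffer the same way after the hunk shown:

// Sketch only: managed-storage readback before a CPU-side read (macOS).
#import <Metal/Metal.h>
#include <cstddef>
#include <iostream>

int main() {
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    id<MTLCommandQueue> queue = [device newCommandQueue];

    //  Hypothetical managed buffer of 1024 floats, standing in for one entry of `buffers`.
    const size_t num_elements = 1024;
    id<MTLBuffer> buffer = [device newBufferWithLength:num_elements*sizeof(float)
                                               options:MTLResourceStorageModeManaged];

    //  ... encode and commit compute work that writes `buffer` on the GPU ...

    //  Mirror of wait(): blit the GPU copy back into the CPU-visible copy.
    id<MTLCommandBuffer> command_buffer = [queue commandBuffer];
    id<MTLBlitCommandEncoder> blit = [command_buffer blitCommandEncoder];
    [blit synchronizeResource:buffer];
    [blit endEncoding];
    [command_buffer commit];
    [command_buffer waitUntilCompleted];

    //  Mirror of print_results(index): read one element on the host.
    const size_t index = 0;
    const float *contents = static_cast<const float *> ([buffer contents]);
    std::cout << contents[index] << std::endl;
}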