diff --git a/.gitignore b/.gitignore index 30944692f3e9965ae406199abc5a3a2bca9196da..0b22c144133191a0f984e9cd410c92f9cfad0f0b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ .DS_Store build +LLVM +graph_framework.xcodeproj/project.xcworkspace/xcuserdata/ +graph_framework.xcodeproj/xcshareddata/xcschemes/graph_playground.xcscheme +graph_framework.xcodeproj/xcuserdata diff --git a/graph_docs/kernel_optimization.dox b/graph_docs/kernel_optimization.dox new file mode 100644 index 0000000000000000000000000000000000000000..971619a7be1fb08e3c85361d7a305f121ba80345 --- /dev/null +++ b/graph_docs/kernel_optimization.dox @@ -0,0 +1,141 @@ +/*! + * @page kernel_optimization Kernel Optimization + * @brief Notes on optimizing kernel performance. + * @tableofcontents + * @section kernel_optimization_introduction Introduction + * There is a balance between kernel run time, compile time, and kernel source code + * size. To discuss these issues we can explore a case study for a simple field + * solve of a Particle In Cell (PIC) code. The field solve involves computing + * the contribution of every particle to the fields. Fields are typically + * discretized onto a grid which will have a smaller size compared to the number + * of particles. + * + * @section kernel_optimization_source Source Code + * Take the example case. 
+ * @code +template +void field_solve_example() { +    timing::measure_diagnostic init("Init"); +    const size_t batch = 1; +    const size_t num_particles = 1000000; +    auto particle_positions = graph::variable (num_particles, "x"); + +    const size_t num_grid = 1000; +    auto grid_value = graph::variable (num_grid, "g"); +    auto particle_index = graph::variable (num_grid, "i"); +    auto gird_position = graph::variable (num_grid, "gx"); + +    std::vector buffer(1000, static_cast (0.0)); +    grid_value->set(buffer); +    particle_index->set(buffer); + +    for (size_t index = 0; index < num_grid; index++) { +        buffer[index] = static_cast (2*index)/static_cast (999) +                      - static_cast (1); +    } +    gird_position->set(buffer); + +    auto indexed_particle = graph::index_1D(particle_positions, +                                            particle_index, +                                            static_cast (1), +                                            static_cast (0)); +    auto next_index = particle_index + static_cast (1.0); +    auto arg = indexed_particle - gird_position; +    auto next_grid_value = grid_value +                         + graph::exp(static_cast (-1)*arg*arg/static_cast (10)); + +//  Unroll kernel call loop by running multiple iterations in a single kernel +//  call. 
+    for (size_t i = 1; i < batch; i++) { +        indexed_particle = graph::index_1D(particle_positions, +                                           particle_index, +                                           static_cast (1), +                                           static_cast (0)); +        next_index = next_index + static_cast (1.0); +        arg = indexed_particle - gird_position; +        next_grid_value = next_grid_value +                        + graph::exp(static_cast (-1)*arg*arg/static_cast (10)); +    } + +    auto state = graph::random_state (jit::context::random_state_size, 0); +    auto random = graph::random (graph::random_state_cast(state)); +    const T max = 1.0; +    const T min = -1.0; +    auto random_real = (max - min)/graph::random_scale ()*random + min; + +    init.print(); + +    timing::measure_diagnostic compile("compile"); + +    workflow::manager work(0); +    work.add_preitem({ +        graph::variable_cast(particle_positions) +    }, {}, { +        {random_real, graph::variable_cast(particle_positions)} +    }, graph::random_state_cast(state), "Initialize_x", num_particles); + +    work.add_item({ +        graph::variable_cast(particle_positions), +        graph::variable_cast(indexed_particle), +        graph::variable_cast(grid_value), +        graph::variable_cast(gird_position) +    }, {}, { +        {next_index, graph::variable_cast(indexed_particle)}, +        {next_grid_value, graph::variable_cast(grid_value)} +    }, NULL, "Set_Grid", num_grid); + +    work.compile(); + +    compile.print(); + +    work.pre_run(); +    timing::measure_diagnostic field_solve("field_solve"); +    for (size_t index = 0, ie = num_particles/batch; index < ie; index++) { +        work.run(); +    } +    work.wait(); +    field_solve.print(); +} + @endcode + * + * In this example we build two kernels. The Initialize_x kernel sets + * random values between @f$-1@f$ and @f$1@f$ for particle positions to + * represent a particle state after a particle push step. This kernel is applied + * over each of the @f$1\times10^{6}@f$ particles. + * + * The Set_Grid kernel mimics a simple field. 
For a given particle + * position @f$x@f$, the contribution to a grid point is defined as + * @f{equation}{g_{next}=g+e^{-\frac{\left(x\left[i\right]-gx\right)^{2}}{10}}@f} + * where @f$i@f$ is the particle index, @f$g@f$ is the current grid value, + * @f$gx@f$ is the grid position, and @f$g_{next}@f$ is the updated value. After + * each kernel call, the particle index is incremented. This kernel is applied + * over each of the @f$1\times10^{3}@f$ grid points. + * + * @section kernel_optimization_performance Kernel Performance + * @image{} html kernel_optimization.png "Wall time scaling comparing compile time and run time for the number of loop unroll iterations." + * In this section we focus on the Set_Grid kernel and the + * batch. When batch = 1, the kernel only adds the kernel + * contribution for a single particle. To compute the full field, we would need + * to call this kernel @f$1\times10^{6}@f$ times. Setting batch > 1 + * allows us to unroll the kernel call loop. This reduces the number of times we + * need to call the kernel but comes at the expense of a larger kernel and an + * associated compile penalty. If we were to set batch = 1000000 + * then we would fully unroll the loop resulting in only needing to call the + * kernel once. However it would come at the expense of the generated kernel + * source having over @f$1\times10^{6}@f$ lines of code. In testing, loop + * unrolling beyond @f$1\times10^{5}@f$ resulted in a segmentation fault. + * + * The figure above shows the scaling of the setup time for the graphs, the + * compile time for the kernel and the resulting run time. A benchmark was + * performed on an Apple M1 Pro for both float and double precision. Note that + * the GPU on the M1 Pro only supports single precision. The CPU benchmark was + * performed on a single core. 
+ * + * Unrolling the loop up to @f$500@f$ iterations shows a significant reduction + * in the GPU kernel run time with a negligible effect on the initialization and + * kernel compile time. Beyond @f$1000@f$ iterations, the improvement to kernel + * runtime becomes negligible. However, any gains we made in runtime + * improvement are lost due to the penalty of kernel compilation. For CPU execution + * there is little to no noticeable performance improvement when unrolling + * kernel call loops. + */ diff --git a/graph_docs/kernel_optimization.png b/graph_docs/kernel_optimization.png new file mode 100644 index 0000000000000000000000000000000000000000..03172cd05dcaa01a040443c18fd182065a267f3c Binary files /dev/null and b/graph_docs/kernel_optimization.png differ diff --git a/graph_docs/main.dox b/graph_docs/main.dox index d7eda1d705bfe77eb899b8f8ddeec217275c401b..00ee75180430f6ba5f01905b02893628ec7a1733 100644 --- a/graph_docs/main.dox +++ b/graph_docs/main.dox @@ -47,6 +47,7 @@ * * @ref tutorial "Tutorial" * * @ref graph_c_binding * * @ref graph_fortran_binding + * * @ref kernel_optimization * *
* @section framework_developer Framework developer guides diff --git a/graph_framework.xcodeproj/project.pbxproj b/graph_framework.xcodeproj/project.pbxproj index 57c3d1388019a03f8b089d0a41609c86413408c7..5558843e59ea8685f3d61195e197d8ce13dcbe7b 100644 --- a/graph_framework.xcodeproj/project.pbxproj +++ b/graph_framework.xcodeproj/project.pbxproj @@ -364,6 +364,7 @@ C75C42952E5CC80B00B0950B /* tutorial.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = tutorial.dox; sourceTree = ""; }; C760B1AB2BC6D760001737A3 /* get_includes.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = get_includes.py; sourceTree = ""; }; C7678FBD2B45C2850025F37E /* bin.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = bin.py; sourceTree = ""; }; + C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = kernel_optimization.dox; sourceTree = ""; }; C77E6DF522DD64E700469621 /* trigonometry.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = trigonometry.hpp; sourceTree = ""; }; C78F3D872DC122B1002E3D94 /* CMakeLists.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; C78F3D882DC122B1002E3D94 /* xkorc.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = xkorc.cpp; sourceTree = ""; }; @@ -627,6 +628,7 @@ C75C42952E5CC80B00B0950B /* tutorial.dox */, C7DD87D32E664B440058BA66 /* code_structure.dox */, C729B8B12ED7521A00A2559D /* discription.dox */, + C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */, C729B8B22ED7536B00A2559D /* use_cases.dox */, C729B8B42ED77CA800A2559D /* code_performance.dox */, ); diff --git a/graph_framework/cuda_context.hpp b/graph_framework/cuda_context.hpp index dc660a09e2ea5b6a0cd02787028a746abeb8c34c..741bafa234a32a13e1861d4c3d025a113af8659f 100644 --- a/graph_framework/cuda_context.hpp +++ b/graph_framework/cuda_context.hpp @@ 
-324,6 +324,7 @@ namespace gpu { check_error(cuModuleGetFunction(&function, module, kernel_name.c_str()), "cuModuleGetFunction"); std::vector buffers; + std::set *> needed_buffers; const size_t buffer_element_size = sizeof(T); for (auto &input : inputs) { @@ -339,6 +340,11 @@ namespace gpu { backend.size()*sizeof(T)), "cuMemcpyHtoD"); buffers.push_back(reinterpret_cast (&kernel_arguments[input.get()])); + needed_buffers.insert(input.get()); + } + if (!needed_buffers.contains(input.get())) { + buffers.push_back(kernel_arguments[input.get()]); + needed_buffers.insert(input.get()); } } for (auto &output : outputs) { @@ -349,6 +355,11 @@ namespace gpu { CU_MEM_ATTACH_GLOBAL), "cuMemAllocManaged"); buffers.push_back(reinterpret_cast (&kernel_arguments[output.get()])); + needed_buffers.insert(output.get()); + } + if (!needed_buffers.contains(output.get())) { + buffers.push_back(kernel_arguments[output.get()]); + needed_buffers.insert(output.get()); } } diff --git a/graph_framework/metal_context.hpp b/graph_framework/metal_context.hpp index 5e0bc964c49e0917a32eac8b4d1768300db51f2e..fd91bee47a8392d931a9d19e891b7e56f0b389b2 100644 --- a/graph_framework/metal_context.hpp +++ b/graph_framework/metal_context.hpp @@ -138,6 +138,7 @@ namespace gpu { } std::vector> buffers; + std::set *> needed_buffers; const size_t buffer_element_size = sizeof(float); for (graph::shared_variable &input : inputs) { @@ -147,6 +148,11 @@ namespace gpu { length:buffer.size()*buffer_element_size options:MTLResourceStorageModeShared]; buffers.push_back(kernel_arguments[input.get()]); + needed_buffers.insert(input.get()); + } + if (!needed_buffers.contains(input.get())) { + buffers.push_back(kernel_arguments[input.get()]); + needed_buffers.insert(input.get()); } } for (graph::shared_leaf &output : outputs) { @@ -154,6 +160,11 @@ namespace gpu { kernel_arguments[output.get()] = [device newBufferWithLength:num_rays*sizeof(float) options:MTLResourceStorageModeShared]; 
buffers.push_back(kernel_arguments[output.get()]); + needed_buffers.insert(output.get()); + } + if (!needed_buffers.contains(output.get())) { + buffers.push_back(kernel_arguments[output.get()]); + needed_buffers.insert(output.get()); } } if (state.get()) {