Commit 971dab56 authored by Cianciosa, Mark's avatar Cianciosa, Mark

Add a case study on kernel performance for a PIC code field-solve-like kernel....

Add a case study on kernel performance for a PIC code field-solve-like kernel. This case study shows the effects of loop unrolling on the run time performance of the kernels. Partial loop unrolling shows a significant performance improvement on the GPU. This may be an effective way to implement the field solve on the GPU without needing to create a loop or sum nodes in the graph. However, we may need to implement MAX, MIN nodes or If/Else nodes to deal with functionality involving grid bounds and particle reinjection.
parent 90e5a05b
+4 −0
.DS_Store
build
LLVM
graph_framework.xcodeproj/project.xcworkspace/xcuserdata/
graph_framework.xcodeproj/xcshareddata/xcschemes/graph_playground.xcscheme
graph_framework.xcodeproj/xcuserdata
+141 −0
/*!
 * @page kernel_optimization Kernel Optimization
 * @brief Notes on optimizing kernel performance.
 * @tableofcontents
 * @section kernel_optimization_introduction Introduction
 * There is a balance between kernel run time, compile time, and kernel source
 * code size. To discuss these trade-offs we can explore a case study of a
 * simple field solve for a Particle In Cell (PIC) code. The field solve
 * involves computing the contribution of every particle to the fields. Fields
 * are typically discretized onto a grid, which is much smaller than the number
 * of particles.
 *
 * @section kernel_optimization_source Source Code
 * Consider the following example case.
 * @code
template<jit::float_scalar T>
void field_solve_example() {
    timing::measure_diagnostic init("Init");
    const size_t batch = 1;
    const size_t num_particles = 1000000;
    auto particle_positions = graph::variable<T> (num_particles, "x");

    const size_t num_grid = 1000;
    auto grid_value = graph::variable<T> (num_grid, "g");
    auto particle_index = graph::variable<T> (num_grid, "i");
    auto grid_position = graph::variable<T> (num_grid, "gx");

    std::vector<T> buffer(num_grid, static_cast<T> (0.0));
    grid_value->set(buffer);
    particle_index->set(buffer);

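//  Uniformly space the grid positions over the range [-1, 1].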
    for (size_t index = 0; index < num_grid; index++) {
        buffer[index] = static_cast<T> (2*index)/static_cast<T> (num_grid - 1)
                      - static_cast<T> (1);
    }
    grid_position->set(buffer);
    
    auto indexed_particle = graph::index_1D(particle_positions,
                                            particle_index,
                                            static_cast<T> (1),
                                            static_cast<T> (0));
    auto next_index = particle_index + static_cast<T> (1.0);
    auto arg = indexed_particle - grid_position;
    auto next_grid_value = grid_value
                         + graph::exp(static_cast<T> (-1)*arg*arg/static_cast<T> (10));

//  Unroll kernel call loop by running multiple iterations in a single kernel
//  call.
    for (size_t i = 1; i < batch; i++) {
        indexed_particle = graph::index_1D(particle_positions,
                                           particle_index,
                                           static_cast<T> (1),
                                           static_cast<T> (0));
        next_index = next_index + static_cast<T> (1.0);
        arg = indexed_particle - grid_position;
        next_grid_value = next_grid_value
                        + graph::exp(static_cast<T> (-1)*arg*arg/static_cast<T> (10));
    }

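//  Build a random number expression that scales raw random values into the
//  range [min, max] for the particle positions.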
    auto state = graph::random_state<T> (jit::context<T>::random_state_size, 0);
    auto random = graph::random<T> (graph::random_state_cast(state));
    const T max = 1.0;
    const T min = -1.0;
    auto random_real = (max - min)/graph::random_scale<T> ()*random + min;

    init.print();

    timing::measure_diagnostic compile("compile");

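//  Register the kernels: Initialize_x is applied over the particles and
//  Set_Grid over the grid points.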
    workflow::manager<T> work(0);
    work.add_preitem({
        graph::variable_cast(particle_positions)
    }, {}, {
        {random_real, graph::variable_cast(particle_positions)}
    }, graph::random_state_cast(state), "Initialize_x", num_particles);

    work.add_item({
        graph::variable_cast(particle_positions),
        graph::variable_cast(indexed_particle),
        graph::variable_cast(grid_value),
        graph::variable_cast(grid_position)
    }, {}, {
        {next_index, graph::variable_cast(indexed_particle)},
        {next_grid_value, graph::variable_cast(grid_value)}
    }, NULL, "Set_Grid", num_grid);

    work.compile();

    compile.print();

    work.pre_run();
    timing::measure_diagnostic field_solve("field_solve");
    for (size_t index = 0, ie = num_particles/batch; index < ie; index++) {
        work.run();
    }
    work.wait();
    field_solve.print();
}
 @endcode
 *
 * In this example we build two kernels. The <tt>Initialize_x</tt> kernel sets
 * random particle positions between @f$-1@f$ and @f$1@f$ to represent the
 * particle state after a particle push step. This kernel is applied over each
 * of the @f$1\times10^{6}@f$ particles.
 *
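 * The expression <tt>(max - min)/random_scale*random + min</tt> maps the raw
 * random draws into the interval @f$\left[min,max\right]@f$. As a point of
 * reference, the sketch below is a minimal host-side C++ version of the same
 * initialization, assuming the raw generator produces uniform integers in
 * @f$\left[0,scale\right]@f$; the function and variable names are illustrative
 * and not part of the framework.
 * @code
#include <cstddef>
#include <random>
#include <vector>

//  Host-side sketch of the Initialize_x step. Raw draws r in [0, scale] are
//  mapped to positions in [min, max] using (max - min)/scale*r + min.
std::vector<double> initialize_positions(const size_t num_particles) {
    std::mt19937 generator(0);
    const double scale = static_cast<double> (std::mt19937::max());
    const double max = 1.0;
    const double min = -1.0;

    std::vector<double> x(num_particles);
    for (auto &xi : x) {
        xi = (max - min)/scale*static_cast<double> (generator()) + min;
    }
    return x;
}
 @endcode
 *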
 * The <tt>Set_Grid</tt> kernel mimics a simple field solve. For a given
 * particle position @f$x@f$, the contribution to a grid point is defined as
 * @f{equation}{g_{next}=g+e^{-\frac{\left(x\left[i\right]-gx\right)^{2}}{10}}@f}
 * where @f$i@f$ is the particle index, @f$g@f$ is the current grid value,
 * @f$gx@f$ is the grid position, and @f$g_{next}@f$ is the updated value. After
 * each kernel call, the particle index is incremented. This kernel is applied
 * over each of the @f$1\times10^{3}@f$ grid points.
 *
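 * As a point of reference, the quantity these two kernels accumulate can be
 * written as a direct double loop over particles and grid points. The sketch
 * below is a minimal serial C++ version, useful for checking results; the
 * function and variable names are illustrative and not part of the framework.
 * @code
#include <cmath>
#include <cstddef>
#include <vector>

//  Serial reference for the field solve. Every particle position x adds a
//  contribution exp(-(x - gx)^2/10) to every grid point gx.
std::vector<double> field_solve_reference(const std::vector<double> &x,
                                          const std::vector<double> &gx) {
    std::vector<double> g(gx.size(), 0.0);
    for (const double xi : x) {
        for (size_t j = 0; j < gx.size(); j++) {
            const double arg = xi - gx[j];
            g[j] += std::exp(-arg*arg/10.0);
        }
    }
    return g;
}
 @endcode
 *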
 * @section kernel_optimization_performance Kernel Performance
 * @image{} html kernel_optimization.png "Wall time scaling comparing compile time and run time for the number of loop unroll iterations."
 * In this section we focus on the <tt>Set_Grid</tt> kernel and the
 * <tt>batch</tt> size. When <tt>batch = 1</tt>, the kernel only adds the field
 * contribution of a single particle. To compute the full field, we would need
 * to call this kernel @f$1\times10^{6}@f$ times. Setting <tt>batch > 1</tt>
 * allows us to unroll the kernel call loop. This reduces the number of times we
 * need to call the kernel, but comes at the expense of a larger kernel and an
 * associated compile time penalty. If we were to set <tt>batch = 1000000</tt>
 * we would fully unroll the loop, requiring only a single kernel call. However,
 * this comes at the expense of a generated kernel source with over
 * @f$1\times10^{6}@f$ lines of code. In testing, loop unrolling beyond
 * @f$1\times10^{5}@f$ iterations resulted in a segmentation fault.
 *
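 * To illustrate the trade-off, the sketch below shows conceptually what a
 * <tt>Set_Grid</tt> kernel body unrolled with <tt>batch = 2</tt> might look
 * like. This is not the framework's actual generated source; it only shows how
 * each unrolled iteration adds another index increment and exponential term, so
 * the kernel source grows linearly with the batch size while the number of
 * kernel calls shrinks by the same factor.
 * @code
#include <cmath>
#include <cstddef>

//  Conceptual sketch of a Set_Grid kernel body unrolled with batch = 2. One
//  invocation handles a single grid point j and consumes two particles.
void set_grid_batch2(const double *x,   //  Particle positions.
                     double *i,         //  Per grid point particle index.
                     double *g,         //  Grid values.
                     const double *gx,  //  Grid positions.
                     const size_t j) {
    double arg = x[static_cast<size_t> (i[j])] - gx[j];
    g[j] += std::exp(-arg*arg/10.0);
    i[j] += 1.0;

    arg = x[static_cast<size_t> (i[j])] - gx[j];
    g[j] += std::exp(-arg*arg/10.0);
    i[j] += 1.0;
}
 @endcode
 *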
 * The figure above shows the scaling of the graph setup time, the kernel
 * compile time, and the resulting run time. The benchmark was performed on an
 * Apple M1 Pro for both float and double precision. Note that the GPU on the
 * M1 Pro only supports single precision. The CPU benchmark was performed on a
 * single core.
 *
 * Unrolling the loop up to @f$500@f$ iterations shows a significant reduction
 * in the GPU kernel run time with a negligible effect on the initialization and
 * kernel compile time. Beyond @f$1000@f$ iterations, the improvement in kernel
 * run time becomes negligible and any gains in run time are lost to the kernel
 * compile time penalty. For CPU execution there is little to no noticeable
 * performance improvement when unrolling the kernel call loop.
 */
+61.2 KiB
+1 −0
@@ -47,6 +47,7 @@
 * * @ref tutorial "Tutorial"
 * * @ref graph_c_binding
 * * @ref graph_fortran_binding
 * * @ref kernel_optimization
 *
 * <hr>
 * @section framework_developer Framework developer guides
+2 −0
@@ -364,6 +364,7 @@
		C75C42952E5CC80B00B0950B /* tutorial.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = tutorial.dox; sourceTree = "<group>"; };
		C760B1AB2BC6D760001737A3 /* get_includes.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = get_includes.py; sourceTree = "<group>"; };
		C7678FBD2B45C2850025F37E /* bin.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = bin.py; sourceTree = "<group>"; };
		C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = kernel_optimization.dox; sourceTree = "<group>"; };
		C77E6DF522DD64E700469621 /* trigonometry.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = trigonometry.hpp; sourceTree = "<group>"; };
		C78F3D872DC122B1002E3D94 /* CMakeLists.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = "<group>"; };
		C78F3D882DC122B1002E3D94 /* xkorc.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = xkorc.cpp; sourceTree = "<group>"; };
@@ -627,6 +628,7 @@
				C75C42952E5CC80B00B0950B /* tutorial.dox */,
				C7DD87D32E664B440058BA66 /* code_structure.dox */,
				C729B8B12ED7521A00A2559D /* discription.dox */,
				C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */,
				C729B8B22ED7536B00A2559D /* use_cases.dox */,
				C729B8B42ED77CA800A2559D /* code_performance.dox */,
			);