Commit 971dab56 authored by Cianciosa, Mark's avatar Cianciosa, Mark

Add a case study on kernel performance for a PIC code field-solve-like kernel....

Add a case study on kernel performance for a PIC code field-solve-like kernel. This case study shows the effects of loop unrolling on the run time performance of the kernels. Partial loop unrolling shows a significant performance improvement on the GPU. This may be an effective way to implement the field solve on the GPU without needing to create a loop or sum nodes in the graph. However, we may need to implement MAX, MIN nodes or If/Else nodes to deal with functionality involving grid bounds and particle reinjection.
parent 90e5a05b
+4 −0
.DS_Store
build
LLVM
graph_framework.xcodeproj/project.xcworkspace/xcuserdata/
graph_framework.xcodeproj/xcshareddata/xcschemes/graph_playground.xcscheme
graph_framework.xcodeproj/xcuserdata
+141 −0
/*!
 * @page kernel_optimization Kernel Optimization
 * @brief Notes on optimizing kernel performance.
 * @tableofcontents
 * @section kernel_optimization_introduction Introduction
 * There is a balance between kernel run time, compile time, and kernel source
 * code size. To discuss these trade-offs we can explore a case study of a
 * simple field solve for a Particle In Cell (PIC) code. The field solve
 * involves computing the contribution of every particle to the fields. Fields
 * are typically discretized onto a grid, which is much smaller than the number
 * of particles.
 *
 * @section kernel_optimization_source Source Code
 * Consider the following example case.
 * @code
template<jit::float_scalar T>
void field_solve_example() {
    timing::measure_diagnostic init("Init");
    const size_t batch = 1;
    const size_t num_particles = 1000000;
    auto particle_positions = graph::variable<T> (num_particles, "x");

    const size_t num_grid = 1000;
    auto grid_value = graph::variable<T> (num_grid, "g");
    auto particle_index = graph::variable<T> (num_grid, "i");
    auto grid_position = graph::variable<T> (num_grid, "gx");

    std::vector<T> buffer(num_grid, static_cast<T> (0.0));
    grid_value->set(buffer);
    particle_index->set(buffer);

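//  Uniformly space the grid positions over the range [-1, 1].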
    for (size_t index = 0; index < num_grid; index++) {
        buffer[index] = static_cast<T> (2*index)/static_cast<T> (num_grid - 1)
                      - static_cast<T> (1);
    }
    grid_position->set(buffer);
    
    auto indexed_particle = graph::index_1D(particle_positions,
                                            particle_index,
                                            static_cast<T> (1),
                                            static_cast<T> (0));
    auto next_index = particle_index + static_cast<T> (1.0);
    auto arg = indexed_particle - grid_position;
    auto next_grid_value = grid_value
                         + graph::exp(static_cast<T> (-1)*arg*arg/static_cast<T> (10));

//  Unroll kernel call loop by running multiple iterations in a single kernel
//  call.
    for (size_t i = 1; i < batch; i++) {
        indexed_particle = graph::index_1D(particle_positions,
                                           particle_index,
                                           static_cast<T> (1),
                                           static_cast<T> (0));
        next_index = next_index + static_cast<T> (1.0);
        arg = indexed_particle - grid_position;
        next_grid_value = next_grid_value
                        + graph::exp(static_cast<T> (-1)*arg*arg/static_cast<T> (10));
    }

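//  Build a random number expression that scales raw random values into the
//  range [min, max] for the particle positions.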
    auto state = graph::random_state<T> (jit::context<T>::random_state_size, 0);
    auto random = graph::random<T> (graph::random_state_cast(state));
    const T max = 1.0;
    const T min = -1.0;
    auto random_real = (max - min)/graph::random_scale<T> ()*random + min;

    init.print();

    timing::measure_diagnostic compile("compile");

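//  Register the kernels: Initialize_x is applied over the particles and
//  Set_Grid over the grid points.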
    workflow::manager<T> work(0);
    work.add_preitem({
        graph::variable_cast(particle_positions)
    }, {}, {
        {random_real, graph::variable_cast(particle_positions)}
    }, graph::random_state_cast(state), "Initialize_x", num_particles);

    work.add_item({
        graph::variable_cast(particle_positions),
        graph::variable_cast(indexed_particle),
        graph::variable_cast(grid_value),
        graph::variable_cast(grid_position)
    }, {}, {
        {next_index, graph::variable_cast(indexed_particle)},
        {next_grid_value, graph::variable_cast(grid_value)}
    }, NULL, "Set_Grid", num_grid);

    work.compile();

    compile.print();

    work.pre_run();
    timing::measure_diagnostic field_solve("field_solve");
    for (size_t index = 0, ie = num_particles/batch; index < ie; index++) {
        work.run();
    }
    work.wait();
    field_solve.print();
}
 @endcode
 *
 * In this example we build two kernels. The <tt>Initialize_x</tt> kernel sets
 * random particle positions between @f$-1@f$ and @f$1@f$ to represent the
 * particle state after a particle push step. This kernel is applied over each
 * of the @f$1\times10^{6}@f$ particles.
 *
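 * The expression <tt>(max - min)/random_scale*random + min</tt> maps the raw
 * random draws into the interval @f$\left[min,max\right]@f$. As a point of
 * reference, the sketch below is a minimal host-side C++ version of the same
 * initialization, assuming the raw generator produces uniform integers in
 * @f$\left[0,scale\right]@f$; the function and variable names are illustrative
 * and not part of the framework.
 * @code
#include <cstddef>
#include <random>
#include <vector>

//  Host-side sketch of the Initialize_x step. Raw draws r in [0, scale] are
//  mapped to positions in [min, max] using (max - min)/scale*r + min.
std::vector<double> initialize_positions(const size_t num_particles) {
    std::mt19937 generator(0);
    const double scale = static_cast<double> (std::mt19937::max());
    const double max = 1.0;
    const double min = -1.0;

    std::vector<double> x(num_particles);
    for (auto &xi : x) {
        xi = (max - min)/scale*static_cast<double> (generator()) + min;
    }
    return x;
}
 @endcode
 *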
 * The <tt>Set_Grid</tt> kernel mimics a simple field solve. For a given
 * particle position @f$x@f$, the contribution to a grid point is defined as
 * @f{equation}{g_{next}=g+e^{-\frac{\left(x\left[i\right]-gx\right)^{2}}{10}}@f}
 * where @f$i@f$ is the particle index, @f$g@f$ is the current grid value,
 * @f$gx@f$ is the grid position, and @f$g_{next}@f$ is the updated value. After
 * each kernel call, the particle index is incremented. This kernel is applied
 * over each of the @f$1\times10^{3}@f$ grid points.
 *
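 * As a point of reference, the quantity these two kernels accumulate can be
 * written as a direct double loop over particles and grid points. The sketch
 * below is a minimal serial C++ version, useful for checking results; the
 * function and variable names are illustrative and not part of the framework.
 * @code
#include <cmath>
#include <cstddef>
#include <vector>

//  Serial reference for the field solve. Every particle position x adds a
//  contribution exp(-(x - gx)^2/10) to every grid point gx.
std::vector<double> field_solve_reference(const std::vector<double> &x,
                                          const std::vector<double> &gx) {
    std::vector<double> g(gx.size(), 0.0);
    for (const double xi : x) {
        for (size_t j = 0; j < gx.size(); j++) {
            const double arg = xi - gx[j];
            g[j] += std::exp(-arg*arg/10.0);
        }
    }
    return g;
}
 @endcode
 *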
 * @section kernel_optimization_performance Kernel Performance
 * @image{} html kernel_optimization.png "Wall time scaling comparing compile time and run time for the number of loop unroll iterations."
 * In this section we focus on the <tt>Set_Grid</tt> kernel and the
 * <tt>batch</tt> size. When <tt>batch = 1</tt>, the kernel only adds the field
 * contribution of a single particle. To compute the full field, we would need
 * to call this kernel @f$1\times10^{6}@f$ times. Setting <tt>batch > 1</tt>
 * allows us to unroll the kernel call loop. This reduces the number of times we
 * need to call the kernel, but comes at the expense of a larger kernel and an
 * associated compile time penalty. If we were to set <tt>batch = 1000000</tt>
 * we would fully unroll the loop, requiring only a single kernel call. However,
 * this comes at the expense of a generated kernel source with over
 * @f$1\times10^{6}@f$ lines of code. In testing, loop unrolling beyond
 * @f$1\times10^{5}@f$ iterations resulted in a segmentation fault.
 *
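 * To illustrate the trade-off, the sketch below shows conceptually what a
 * <tt>Set_Grid</tt> kernel body unrolled with <tt>batch = 2</tt> might look
 * like. This is not the framework's actual generated source; it only shows how
 * each unrolled iteration adds another index increment and exponential term, so
 * the kernel source grows linearly with the batch size while the number of
 * kernel calls shrinks by the same factor.
 * @code
#include <cmath>
#include <cstddef>

//  Conceptual sketch of a Set_Grid kernel body unrolled with batch = 2. One
//  invocation handles a single grid point j and consumes two particles.
void set_grid_batch2(const double *x,   //  Particle positions.
                     double *i,         //  Per grid point particle index.
                     double *g,         //  Grid values.
                     const double *gx,  //  Grid positions.
                     const size_t j) {
    double arg = x[static_cast<size_t> (i[j])] - gx[j];
    g[j] += std::exp(-arg*arg/10.0);
    i[j] += 1.0;

    arg = x[static_cast<size_t> (i[j])] - gx[j];
    g[j] += std::exp(-arg*arg/10.0);
    i[j] += 1.0;
}
 @endcode
 *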
 * The figure above shows the scaling of the graph setup time, the kernel
 * compile time, and the resulting run time. The benchmark was performed on an
 * Apple M1 Pro for both float and double precision. Note that the GPU on the
 * M1 Pro only supports single precision. The CPU benchmark was performed on a
 * single core.
 *
 * Unrolling the loop up to @f$500@f$ iterations shows a significant reduction
 * in the GPU kernel run time with a negligible effect on the initialization and
 * kernel compile time. Beyond @f$1000@f$ iterations, the improvement in kernel
 * run time becomes negligible and any gains in run time are lost to the kernel
 * compile time penalty. For CPU execution there is little to no noticeable
 * performance improvement when unrolling the kernel call loop.
 */
+61.2 KiB
+1 −0
@@ -47,6 +47,7 @@
 * * @ref tutorial "Tutorial"
 * * @ref graph_c_binding
 * * @ref graph_fortran_binding
 * * @ref kernel_optimization
 *
 * <hr>
 * @section framework_developer Framework developer guides
+2 −0
@@ -364,6 +364,7 @@
		C75C42952E5CC80B00B0950B /* tutorial.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = tutorial.dox; sourceTree = "<group>"; };
		C760B1AB2BC6D760001737A3 /* get_includes.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = get_includes.py; sourceTree = "<group>"; };
		C7678FBD2B45C2850025F37E /* bin.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = bin.py; sourceTree = "<group>"; };
		C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = kernel_optimization.dox; sourceTree = "<group>"; };
		C77E6DF522DD64E700469621 /* trigonometry.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = trigonometry.hpp; sourceTree = "<group>"; };
		C78F3D872DC122B1002E3D94 /* CMakeLists.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = "<group>"; };
		C78F3D882DC122B1002E3D94 /* xkorc.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = xkorc.cpp; sourceTree = "<group>"; };
@@ -627,6 +628,7 @@
				C75C42952E5CC80B00B0950B /* tutorial.dox */,
				C7DD87D32E664B440058BA66 /* code_structure.dox */,
				C729B8B12ED7521A00A2559D /* discription.dox */,
				C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */,
				C729B8B22ED7536B00A2559D /* use_cases.dox */,
				C729B8B42ED77CA800A2559D /* code_performance.dox */,
			);