diff --git a/.gitignore b/.gitignore index 30944692f3e9965ae406199abc5a3a2bca9196da..0b22c144133191a0f984e9cd410c92f9cfad0f0b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ .DS_Store build +LLVM +graph_framework.xcodeproj/project.xcworkspace/xcuserdata/ +graph_framework.xcodeproj/xcshareddata/xcschemes/graph_playground.xcscheme +graph_framework.xcodeproj/xcuserdata diff --git a/graph_docs/kernel_optimization.dox b/graph_docs/kernel_optimization.dox new file mode 100644 index 0000000000000000000000000000000000000000..971619a7be1fb08e3c85361d7a305f121ba80345 --- /dev/null +++ b/graph_docs/kernel_optimization.dox @@ -0,0 +1,141 @@ +/*! + * @page kernel_optimization Kernel Optimization + * @brief Notes on optimizing kernel performance. + * @tableofcontents + * @section kernel_optimization_introduction Introduction + * There is a balance between kernel run time, compile time, and kernel source code + * size. To discuss these issues we can explore a case study for a simple field + * solve of a Particle In Cell (PIC) code. The field solve involves computing + * the contribution of every particle to the fields. Fields are typically + * discretized onto a grid which will have a smaller size compared to the number + * of particles. + * + * @section kernel_optimization_source Source Code + * Take the example case. 
+ * @code +template +void field_solve_example() { +    timing::measure_diagnostic init("Init"); +    const size_t batch = 1; +    const size_t num_particles = 1000000; +    auto particle_positions = graph::variable (num_particles, "x"); + +    const size_t num_grid = 1000; +    auto grid_value = graph::variable (num_grid, "g"); +    auto particle_index = graph::variable (num_grid, "i"); +    auto gird_position = graph::variable (num_grid, "gx"); + +    std::vector buffer(1000, static_cast (0.0)); +    grid_value->set(buffer); +    particle_index->set(buffer); + +    for (size_t index = 0; index < num_grid; index++) { +        buffer[index] = static_cast (2*index)/static_cast (999) +                      - static_cast (1); +    } +    gird_position->set(buffer); + +    auto indexed_particle = graph::index_1D(particle_positions, +                                            particle_index, +                                            static_cast (1), +                                            static_cast (0)); +    auto next_index = particle_index + static_cast (1.0); +    auto arg = indexed_particle - gird_position; +    auto next_grid_value = grid_value +                         + graph::exp(static_cast (-1)*arg*arg/static_cast (10)); + +//  Unroll kernel call loop by running multiple iterations in a single kernel +//  call. 
+    for (size_t i = 1; i < batch; i++) { +        indexed_particle = graph::index_1D(particle_positions, +                                           particle_index, +                                           static_cast (1), +                                           static_cast (0)); +        next_index = next_index + static_cast (1.0); +        arg = indexed_particle - gird_position; +        next_grid_value = next_grid_value +                        + graph::exp(static_cast (-1)*arg*arg/static_cast (10)); +    } + +    auto state = graph::random_state (jit::context::random_state_size, 0); +    auto random = graph::random (graph::random_state_cast(state)); +    const T max = 1.0; +    const T min = -1.0; +    auto random_real = (max - min)/graph::random_scale ()*random + min; + +    init.print(); + +    timing::measure_diagnostic compile("compile"); + +    workflow::manager work(0); +    work.add_preitem({ +        graph::variable_cast(particle_positions) +    }, {}, { +        {random_real, graph::variable_cast(particle_positions)} +    }, graph::random_state_cast(state), "Initialize_x", num_particles); + +    work.add_item({ +        graph::variable_cast(particle_positions), +        graph::variable_cast(indexed_particle), +        graph::variable_cast(grid_value), +        graph::variable_cast(gird_position) +    }, {}, { +        {next_index, graph::variable_cast(indexed_particle)}, +        {next_grid_value, graph::variable_cast(grid_value)} +    }, NULL, "Set_Grid", num_grid); + +    work.compile(); + +    compile.print(); + +    work.pre_run(); +    timing::measure_diagnostic field_solve("field_solve"); +    for (size_t index = 0, ie = num_particles/batch; index < ie; index++) { +        work.run(); +    } +    work.wait(); +    field_solve.print(); +} + @endcode + * + * In this example we build two kernels. The Initialize_x kernel sets + * random values between @f$-1@f$ and @f$1@f$ for particle positions to + * represent a particle state after a particle push step. This kernel is applied + * over each of the @f$1\times10^{6}@f$ particles. + * + * The Set_Grid kernel mimics a simple field. 
For a given particle + * position @f$x@f$, the contribution to a grid point is defined as + * @f{equation}{g_{next}=g+e^{-\frac{\left(x\left[i\right]-gx\right)^{2}}{10}}@f} + * where @f$i@f$ is the particle index, @f$g@f$ is the current grid value, + * @f$gx@f$ is the grid position, and @f$g_{next}@f$ is the updated value. After + * each kernel call, the particle index is incremented. This kernel is applied + * over each of the @f$1\times10^{3}@f$ grid points. + * + * @section kernel_optimization_performance Kernel Performance + * @image{} html kernel_optimization.png "Wall time scaling comparing compile time and run time for the number of loop unroll iterations." + * In this section we focus on the Set_Grid kernel and the + * batch. When batch = 1, the kernel only adds the kernel + * contribution for a single particle. To compute the full field, we would need + * to call this kernel @f$1\times10^{6}@f$ times. Setting batch > 1 + * allows us to unroll the kernel call loop. This reduces the number of times we + * need to call the kernel but comes at the expense of a larger kernel and an + * associated compile penalty. If we were to set batch = 1000000 + * then we would fully unroll the loop resulting in only needing to call the + * kernel once. However it would come at the expense of the generated kernel + * source having over @f$1\times10^{6}@f$ lines of code. In testing, loop + * unrolling beyond @f$1\times10^{5}@f$ resulted in a segmentation fault. + * + * The figure above shows the scaling of the setup time for the graphs, the + * compile time for the kernel and the resulting run time. A benchmark was + * performed on an Apple M1 Pro for both float and double precision. Note that + * the GPU on the M1 Pro only supports single precision. The CPU benchmark was + * performed on a single core. 
+ * + * Unrolling the loop up to @f$500@f$ iterations shows a significant reduction + * in the GPU kernel run time with a negligible effect on the initialization and + * kernel compile time. Beyond @f$1000@f$ iterations, the improvement to kernel + * runtime becomes negligible. However, any gains we made in runtime + * improvement are lost due to the penalty of kernel compilation. For CPU execution + * there is little to no noticeable performance improvement when unrolling + * kernel call loops. + */ diff --git a/graph_docs/kernel_optimization.png b/graph_docs/kernel_optimization.png new file mode 100644 index 0000000000000000000000000000000000000000..03172cd05dcaa01a040443c18fd182065a267f3c Binary files /dev/null and b/graph_docs/kernel_optimization.png differ diff --git a/graph_docs/main.dox b/graph_docs/main.dox index d7eda1d705bfe77eb899b8f8ddeec217275c401b..00ee75180430f6ba5f01905b02893628ec7a1733 100644 --- a/graph_docs/main.dox +++ b/graph_docs/main.dox @@ -47,6 +47,7 @@ * * @ref tutorial "Tutorial" * * @ref graph_c_binding * * @ref graph_fortran_binding + * * @ref kernel_optimization * *
* @section framework_developer Framework developer guides diff --git a/graph_framework.xcodeproj/project.pbxproj b/graph_framework.xcodeproj/project.pbxproj index 57c3d1388019a03f8b089d0a41609c86413408c7..5558843e59ea8685f3d61195e197d8ce13dcbe7b 100644 --- a/graph_framework.xcodeproj/project.pbxproj +++ b/graph_framework.xcodeproj/project.pbxproj @@ -364,6 +364,7 @@ C75C42952E5CC80B00B0950B /* tutorial.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = tutorial.dox; sourceTree = ""; }; C760B1AB2BC6D760001737A3 /* get_includes.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = get_includes.py; sourceTree = ""; }; C7678FBD2B45C2850025F37E /* bin.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; path = bin.py; sourceTree = ""; }; + C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = kernel_optimization.dox; sourceTree = ""; }; C77E6DF522DD64E700469621 /* trigonometry.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = trigonometry.hpp; sourceTree = ""; }; C78F3D872DC122B1002E3D94 /* CMakeLists.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; C78F3D882DC122B1002E3D94 /* xkorc.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = xkorc.cpp; sourceTree = ""; }; @@ -627,6 +628,7 @@ C75C42952E5CC80B00B0950B /* tutorial.dox */, C7DD87D32E664B440058BA66 /* code_structure.dox */, C729B8B12ED7521A00A2559D /* discription.dox */, + C77707F62F5F288B00BA4E87 /* kernel_optimization.dox */, C729B8B22ED7536B00A2559D /* use_cases.dox */, C729B8B42ED77CA800A2559D /* code_performance.dox */, ); diff --git a/graph_framework/cuda_context.hpp b/graph_framework/cuda_context.hpp index dc660a09e2ea5b6a0cd02787028a746abeb8c34c..741bafa234a32a13e1861d4c3d025a113af8659f 100644 --- a/graph_framework/cuda_context.hpp +++ b/graph_framework/cuda_context.hpp @@ 
-324,6 +324,7 @@ namespace gpu { check_error(cuModuleGetFunction(&function, module, kernel_name.c_str()), "cuModuleGetFunction"); std::vector buffers; + std::set *> needed_buffers; const size_t buffer_element_size = sizeof(T); for (auto &input : inputs) { @@ -339,6 +340,11 @@ namespace gpu { backend.size()*sizeof(T)), "cuMemcpyHtoD"); buffers.push_back(reinterpret_cast (&kernel_arguments[input.get()])); + needed_buffers.insert(input.get()); + } + if (!needed_buffers.contains(input.get())) { + buffers.push_back(kernel_arguments[input.get()]); + needed_buffers.insert(input.get()); } } for (auto &output : outputs) { @@ -349,6 +355,11 @@ namespace gpu { CU_MEM_ATTACH_GLOBAL), "cuMemAllocManaged"); buffers.push_back(reinterpret_cast (&kernel_arguments[output.get()])); + needed_buffers.insert(output.get()); + } + if (!needed_buffers.contains(output.get())) { + buffers.push_back(kernel_arguments[output.get()]); + needed_buffers.insert(output.get()); } } diff --git a/graph_framework/metal_context.hpp b/graph_framework/metal_context.hpp index 5e0bc964c49e0917a32eac8b4d1768300db51f2e..fd91bee47a8392d931a9d19e891b7e56f0b389b2 100644 --- a/graph_framework/metal_context.hpp +++ b/graph_framework/metal_context.hpp @@ -138,6 +138,7 @@ namespace gpu { } std::vector> buffers; + std::set *> needed_buffers; const size_t buffer_element_size = sizeof(float); for (graph::shared_variable &input : inputs) { @@ -147,6 +148,11 @@ namespace gpu { length:buffer.size()*buffer_element_size options:MTLResourceStorageModeShared]; buffers.push_back(kernel_arguments[input.get()]); + needed_buffers.insert(input.get()); + } + if (!needed_buffers.contains(input.get())) { + buffers.push_back(kernel_arguments[input.get()]); + needed_buffers.insert(input.get()); } } for (graph::shared_leaf &output : outputs) { @@ -154,6 +160,11 @@ namespace gpu { kernel_arguments[output.get()] = [device newBufferWithLength:num_rays*sizeof(float) options:MTLResourceStorageModeShared]; 
buffers.push_back(kernel_arguments[output.get()]); + needed_buffers.insert(output.get()); + } + if (!needed_buffers.contains(output.get())) { + buffers.push_back(kernel_arguments[output.get()]); + needed_buffers.insert(output.get()); } } if (state.get()) {