diff --git a/graph_docs/Comparison.png b/graph_docs/Comparison.png index af33ffbff9ca889039a2f6d3897d648ef611f17b..293d5769b03acd3311744eea871c3943a960aa90 100644 Binary files a/graph_docs/Comparison.png and b/graph_docs/Comparison.png differ diff --git a/graph_docs/code_performance.dox b/graph_docs/code_performance.dox index 444574ae740f20aee347942267be8d867b918573..ac08f5bf44283eb146269e9e78db33e35ca5ec8d 100644 --- a/graph_docs/code_performance.dox +++ b/graph_docs/code_performance.dox @@ -45,12 +45,16 @@ * @f{equation}{\frac{\partial\vec{v}}{\partial t} = dt\vec{v}\times\vec{B}@f} * @f{equation}{\frac{\partial\vec{x}}{\partial t} = dt\vec{v}@f} * - * We compared the graph framework against the MLX framework since it supports - * Apple GPUs and JAX due to it's popularity. Source codes for this benchmark - * case is available in the appendix. Figure \ref{fig:compare} shows the through put of - * pushing $10^{8}$ particles for $10^{3}$ time steps. The graph framework - * consistently shows the best throughput on both CPUs and GPUs. Note MLX CPU - * throughput could by improved by splitting the problem to multiple threads. + * We compared the graph framework against the + * MLX + * framework since it supports Apple GPUs, + * JAX due to its popularity, + * and Kokkos for its performance + * portability. Source code for this benchmark case is available in the + * appendix. Figure \ref{fig:compare} shows the throughput of pushing $10^{8}$ + * particles for $10^{3}$ time steps. The graph framework consistently shows the + * best throughput on both CPUs and GPUs. Note MLX CPU throughput could be + * improved by splitting the problem across multiple threads. 
* * @subsection code_performance_comparison_codes Source codes for throughput benchmark comparison * @subsubsection code_performance_comparison_graph Graph Framework @@ -93,7 +97,7 @@ for (size_t i = 0, ie = threads.size(); i < ie; i++) { auto v_next = v + dt*lorentz; auto pos_next = pos + dt*v_next; - workflow::manager work(0); + workflow::manager work(thread_number); work.add_item({ graph::variable_cast(x), graph::variable_cast(y), @@ -177,7 +181,7 @@ const auto total_time = end - start; def push(x, y, z, vx, vy, vz): dt = 0.000001 vx_next = vx + dt*(vy*1 - vz*0) - vy_next = vy + dt*(vz*0 - vy*1) + vy_next = vy + dt*(vz*0 - vx*1) vz_next = vz + dt*(vx*0 - vy*0) return vx_next, vy_next, vz_next, x + dt*vx_next, y + dt*vy_next, z + dt*vz_next @@ -201,6 +205,48 @@ jax.block_until_ready([x, y, z, vx, vy, vz]) end = time.time() print(end - start) + @endcode + * + * @subsubsection code_performance_comparison_kokkos Kokkos + * @code +const size_t size = 100000000; +const size_t steps = 1000; + +using ViewVectorType = Kokkos::View; +ViewVectorType x("x", size); +ViewVectorType y("y", size); +ViewVectorType z("z", size); + +ViewVectorType vx("vx", size); +ViewVectorType vy("vy", size); +ViewVectorType vz("vz", size); + +Kokkos::parallel_for(size, KOKKOS_LAMBDA(const int64_t index) { + vx[index] = 1; + vz[index] = 1; +}); + +const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + +for (size_t i = 0; i < steps; i++) { + Kokkos::parallel_for(size, KOKKOS_LAMBDA(const int64_t index) { + const float dt = 0.000001; + const float vx_next = vx[index] + dt*(vy[index]*1 - vz[index]*0); + const float vy_next = vy[index] + dt*(vz[index]*0 - vx[index]*1); + const float vz_next = vz[index] + dt*(vx[index]*0 - vy[index]*0); + x[index] += dt*vx_next; + y[index] += dt*vy_next; + z[index] += dt*vz_next; + vx[index] = vx_next; + vy[index] = vy_next; + vz[index] = vz_next; + }); +} + +Kokkos::fence(); + 
+std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); +const auto total_time = end - start; @endcode */ diff --git a/graph_docs/discription.dox b/graph_docs/discription.dox index 6cee61523c018e8802383080f648774b3d9cbc3d..b5017a11e545ae2346421cc249805b987d68a5d3 100644 --- a/graph_docs/discription.dox +++ b/graph_docs/discription.dox @@ -5,8 +5,8 @@ * @section discription_introduction Introduction * The basic functionality of this framework is to build expression graphs * representing mathematical equations. Reduce those graphs to simpler forms. - * Transform those graph to take derivatives. Just-In-Time (JIT) compile them to - * available compute device kernels. Then run those kernels in workflow. The + * Transform those graphs to take derivatives. Just-In-Time (JIT) compile them + * to available compute device kernels. Then run those kernels in workflows. The * code is written in using C++23 features. To simplify embedding into legacy * codes, there are additional language bindings for C and Fortran. * @@ -48,9 +48,10 @@ * be reduced to a single constant by calling the evaluate method. Sub-graph * expressions are combined, factored out, or moved to enable better reductions * on subsequent passes. As new ways of reducing the graph are implemented, - * current and existing code built using this framework benefit from improved - * speed. The figure above shows a visualization of the tree data structure for - * the equation of a line, the derivative, and the subsequent reductions. + * current and existing code built using this framework will benefit from + * improved speed. The figure above shows a visualization of the tree data + * structure for the equation of a line, the derivative, and the subsequent + * reductions. * * @subsubsection discription_graphs_builds Building Graphs * As an example building an expression of line @f$y=mx+b@f$ accomplished by @@ -79,8 +80,8 @@ auto dydmx = y->df(0.5*x); * running them in order. 
One @ref workflow::manager is created for each device * or thread. The user is responsible for creating threads. Each kernel is * generated through a @ref workflow::work_item. A work item is defined by - * kernel @ref graph::input_nodes, @ref graph::output_nodes and - * @ref graph::map_nodes. Map items are used to take the results of kernel and + * kernel @ref graph::input_nodes, @ref graph::output_nodes, and + * @ref graph::map_nodes. Map items are used to take the results of a kernel and * update an input buffer. Using our example of line equation, we can create a * workflow to compute @f$y@f$ and @f$\frac{\partial y}{\partial x}@f$. * @code @@ -99,7 +100,7 @@ work.add_item({ * elements in the inputs. Multiple work items can be created and will be * executed in order of creation. * - * Once the work items are defined that can be JIT compiled to a backend device. + * Once the work items are defined they can be JIT compiled to a backend device. * The graph framework supports back ends for generic CPUs, Apple Metal GPUs, * Nvidia Cuda GPUs, and initial HIP support of AMD GPUs. Each back end supplies * relevant driver code to build the kernel source, compile the kernel, build diff --git a/graph_docs/general.dox b/graph_docs/general.dox index 40b41f5e19916b6d8a3ab27e068afb368bb7b7b3..0034b55116c7c6fba124323e3f4b38f8e34548b8 100644 --- a/graph_docs/general.dox +++ b/graph_docs/general.dox @@ -39,7 +39,7 @@ * as either variables @f$x@f$ or constants @f$m,b@f$. These nodes are connected * by nodes for multiply and addition operations. The output @f$y@f$ represents * the entire graph of operations. - * @image{} html line_graph.png "The graph structure for @f$y=mx+b@f$." + * @image{} html line_graph.png "The graph structure for y = mx + b." * Evaluation of graphs start from the top most node in this case the @f$+@f$ * operation. Evaluation of a node is not performed until all sub-nodes are * evaluated starting with the left operand. 
Evaluation starts by recursively @@ -58,9 +58,10 @@ * graphs of a function derivative. For an example of taking derivatives see the * @ref tutorial_derivatives "auto differentiation tutorial". Lets say that we * want to take the derivative of @f$\frac{\partial y}{\partial x}@f$. This is - * achieved by evaluating the until bottom left most node is reached. Then a new - * graph is build starting with @f$\frac{\partial m}{\partial x}=0@f$. Applying - * the first half of the chain rule we build a new graph for @f$0x@f$ + * achieved by evaluating the graph until the bottom left most node is reached. + * Then a new graph is constructed starting with + * @f$\frac{\partial m}{\partial x}=0@f$. Applying the first half of the chain + * rule we build a new graph for @f$0x@f$ * @image{} html line_graph_dydf1.png "" * Then we take the derivative of the right operand and apply the second half * of the chain rule to build a new graph for @f$0x=0@f$. @@ -73,8 +74,8 @@ * The final expression for @f$\frac{\partial y}{\partial x}@f$ contains many * unnecessary nodes in the graph. Instead of building full graphs, we can * simplify and eliminate nodes as we build them. For instance, when the - * expression @f$0x@f$ this created can be immediately reduce it to a single - * node. + * expression @f$0\times x@f$ is created, this can be immediately reduced to a + * single node @f$0@f$. * @image{} html line_graph_reduce1.png "" * Applying all possible reductions reduces the final expression to * @f$\frac{\partial y}{\partial x}=m@f$. @@ -109,7 +110,7 @@ * @subsection general_concepts_compile_maps Maps * Maps enable the results of an output node to be stored in an input node. This * is used for a wide varity of cases. For instance take a gradient decent step. 
- * @f{equation}{y = y + \frac{\partial f}{\partial x}@f} + * @f{equation}{y_{i+1} = y_{i} + \frac{\partial f}{\partial x}@f} * In this case the output of the expression * @f$y + \frac{\partial f}{\partial x}@f$ * can be mapped to update @f$y@f$. @@ -122,7 +123,7 @@ *
* @section general_concepts_safe_math Safe Math * There are some conditions where mathematically, a graph should evaluate to a - * normal number. However, when evaluated suing floating point precision, can + * normal number. However, when evaluated using floating point precision, can * lead to Inf or NaN. An example of this the * @f$\exp\left(x\right)@f$ function. For large argument values, * @f$\exp\left(x\right)@f$ overflows the maximum floating point precision and diff --git a/graph_docs/tutorial.dox b/graph_docs/tutorial.dox index a59329ff38301e991deb015f43927bc020b27052..d3aaca42b42037383a1ec2d525da5203689712f8 100644 --- a/graph_docs/tutorial.dox +++ b/graph_docs/tutorial.dox @@ -13,7 +13,7 @@ * executable target which can be used to test out the API's of this framework. * The playground starts with a blank main function. * @code -#include "../graph_framework/jit.hpp" +#include "graph_framework.hpp" int main(int argc, const char * argv[]) { START_GPU @@ -30,7 +30,7 @@ int main(int argc, const char * argv[]) { * main. This will allow us to play with different floating point types. For now * we will start with a simple float type. * @code -#include "../graph_framework/jit.hpp" +#include "graph_framework.hpp" template void run_tutorial() { @@ -84,16 +84,16 @@ void run_tutorial() { * so all method are called using the -> operator. * * @subsection tutorial_constant Constant Nodes - * Next we want to define a constant. There are two method to define constants + * Next we want to define a constant. There are two methods to define constants * explicitly or implicitly. * @code template void run_tutorial() { auto x = graph::variable(1000, "x"); -// Define explicit constant. +// Define explicit constant. auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; } @endcode @@ -110,9 +110,9 @@ template void run_tutorial() { auto x = graph::variable(1000, "x"); -// Define explicit constant. +// Define explicit constant. 
auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; // Equation of a line @@ -133,15 +133,15 @@ template void run_tutorial() { auto x = graph::variable(1000, "x"); -// Define explicit constant. +// Define explicit constant. auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; -// Equation of a line +// Equation of a line auto y = m*x + b; -// Auto differentiation. +// Auto differentiation. auto dydx = y->df(x); dydx->to_latex(); std::cout << std::endl; @@ -168,18 +168,18 @@ template void run_tutorial() { auto x = graph::variable(3, "x"); -// Define explicit constant. +// Define explicit constant. auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; -// Equation of a line +// Equation of a line auto y = m*x + b; -// Auto differentiation. +// Auto differentiation. auto dydx = y->df(x); -// Create a workflow manager. +// Create a workflow manager. workflow::manager work(0); } @endcode @@ -322,13 +322,13 @@ void run_tutorial() { auto x = graph::variable (3, "x"); x->set({1.0, 2.0, 3.0}); -// Define an objective function. +// Define an objective function. auto f = 0.2*x*x*x + 0.6*x*x + 0.4*x + 0.5; // Define a step update. auto x_new = x - f/f->df(x); -// Create a workflow manager. +// Create a workflow manager. workflow::manager work(0); work.add_item({ graph::variable_cast(x) @@ -372,7 +372,7 @@ void run_tutorial() { * a reduction on the host side and transferring the entire array to the host. * To improve this we can use a converge item instead. * @code -// Create a workflow manager. +// Create a workflow manager. 
workflow::manager work(0); work.add_converge_item({ graph::variable_cast(x) diff --git a/graph_docs/use_cases.dox b/graph_docs/use_cases.dox index a2e7f530e35b6692f3262452f63c09f612e729c6..f47fa274899f10ca9e5d443faa528806d963924f 100644 --- a/graph_docs/use_cases.dox +++ b/graph_docs/use_cases.dox @@ -11,9 +11,9 @@ * @subsection use_cases_rf RF Ray tracing * Geometric optics is a set of asymptotic approximation methods to solve wave * equations. The physics of the particular wave determines an algebraic - * relation between $\omega$ and $\vec{k}$ called a dispersion relation, - * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter $t$ does not appear - * explicitly in the dispersion relation, the function + * relation between @f$\omega@f$ and @f$\vec{k}@f$ called a dispersion relation, + * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter @f$t@f$ does not + * appear explicitly in the dispersion relation, the function * @f$\omega\left(\vec{k}\left(t\right),\vec{x}\left(t\right)\right)@f$ is * constant along the ray trajectory * @f{equation}{\frac{\partial\omega}{\partial t}=\frac{\partial\omega}{\partial\vec{x}}\cdot\frac{\partial\vec{x}}{\partial t}+\frac{\partial\omega}{\partial\vec{k}}\cdot\frac{\partial\vec{k}}{\partial t}\equiv 0@f} @@ -41,7 +41,7 @@ * by relatively simple dispersion relations in plane stratified plasmas, that * is plasma with spatial variation only in the @f$x@f$ direction. In a * spatially varying medium, at a given frequency, there may be regions in which - * the solution of the dispersion relation, $\vec{k}$, is real, and the wave + * the solution of the dispersion relation, @f$\vec{k}@f$, is real, and the wave * propagates. In other regions @f$\vec{k}@f$ is imaginary and the wave does not * propagate, referred to as evanescent. The boundary between a region of * propagation and evanescence is a surface called a cut-off. 
It is also @@ -53,11 +53,11 @@ * behavior, and the behavior of rays in their vicinity is an indication of the * correctness of the solution. * - * For plasma, the spatial dependence of the dispersion relation comes through + * For plasmas, the spatial dependence of the dispersion relation comes through * variation of the plasma equilibrium quantities. These include the vector * magnetic field, @f$\vec{B}\left(x\right)@f$, the density of each plasma * particle species, @f$n_{s}\left(x\right)@f$, and the temperature of each - * particle species, @f$T_{s}\left(x\right)@f$, where $s$ indicates a + * particle species, @f$T_{s}\left(x\right)@f$, where @f$s@f$ indicates a * particular species. For the cases presented here a linear gradient along the * @f$x@f$ direction is taken for either the particle density or magnetic field * strength. diff --git a/graph_framework/cpu_context.hpp b/graph_framework/cpu_context.hpp index f7d677555bc6c4efc79aab6fcafcb1058a92d74c..9e27c9d6aeb981bc951d74dbf50afca3f77b6175 100644 --- a/graph_framework/cpu_context.hpp +++ b/graph_framework/cpu_context.hpp @@ -116,7 +116,7 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Construct a cpu context. /// -/// @param[in] index Concurrent index. Not used. +/// @param[in] index Device index. Not used. //------------------------------------------------------------------------------ cpu_context(const size_t index) { llvm::InitializeNativeTarget(); diff --git a/graph_framework/cuda_context.hpp b/graph_framework/cuda_context.hpp index ee02e0bfc43248ed9b006f268699e6d980d90a43..dc660a09e2ea5b6a0cd02787028a746abeb8c34c 100644 --- a/graph_framework/cuda_context.hpp +++ b/graph_framework/cuda_context.hpp @@ -134,7 +134,7 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Cuda context constructor. /// -/// @param[in] index Concurrent index. +/// @param[in] index Device index. 
//------------------------------------------------------------------------------ cuda_context(const size_t index) : result_buffer(0), module(0), offset_buffer(0) { check_error(cuDeviceGet(&device, index), "cuDeviceGet"); diff --git a/graph_framework/metal_context.hpp b/graph_framework/metal_context.hpp index ae78312c2c90c7571289a4ee16f58a01f1a96491..5e0bc964c49e0917a32eac8b4d1768300db51f2e 100644 --- a/graph_framework/metal_context.hpp +++ b/graph_framework/metal_context.hpp @@ -65,7 +65,7 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Construct a metal context. /// -/// @param[in] index Concurrent index. +/// @param[in] index Device index. //------------------------------------------------------------------------------ metal_context(const size_t index) : device([MTLCopyAllDevices() objectAtIndex:index]), diff --git a/graph_framework/node.hpp b/graph_framework/node.hpp index ae729bd9da0539cfe26987604ee94f391043e5a9..9e675db406a1f43f142f5c66e289e6aa473c70f7 100644 --- a/graph_framework/node.hpp +++ b/graph_framework/node.hpp @@ -311,7 +311,7 @@ /// @code /// virtual shared_leaf remove_pseudo() { /// if (this->has_pseudo()) { -/// return sqrt(this->arg->remove_pseudo()); +/// return foo(this->arg->remove_pseudo()); /// } /// return this->shared_from_this(); /// } diff --git a/graph_framework/workflow.hpp b/graph_framework/workflow.hpp index 421442b79c8e1ef6f3ae47532ce4d65a40f13102..012892ca757eb0ae68ed8022ab70754abdc9351c 100644 --- a/graph_framework/workflow.hpp +++ b/graph_framework/workflow.hpp @@ -183,7 +183,13 @@ namespace workflow { //------------------------------------------------------------------------------ /// @brief Workflow manager constructor. /// -/// @param[in] index Concurrent index. +/// For GPU devices, this selects the device number to run on. For CPU devices +/// this parameter is ignored. 
+/// +/// @note It is possible to create multiple workflow managers for the same +/// GPU device, and doing so may have performance benefits. +/// +/// @param[in] index Device index. //------------------------------------------------------------------------------ manager(const size_t index) : context(index), add_reduction(false) {}