diff --git a/README.md b/README.md index 055dcd90cf8ede27ed312513a46fa735bc5571bc..fffa38eb970db40a394c2d43d26004a242d39e57 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,25 @@ **graph_framework**: A graph computation framework that supports auto differentiation. It is designed to allow domain scientists to create cross -platform GPU accelerated code and embed those codes in existing legacy tools. +platform GPU accelerated code and embed those codes in existing legacy tools. It +is designed for the domain of physics problems where the same physics is +applied to a large ensemble of independent systems. + +This framework enables: +1. Portability to Nvidia, AMD, and Apple GPUs and CPUs. +2. Abstraction of the physics from the compute. +3. Auto Differentiation. +4. Embedding in C, C++, and Fortran codes. + +The compute kernels created have strong scaling to multiple devices + +![Strong Scaling](graph_docs/StrongScaling.png) + +and the best throughput on both GPUs and CPUs compared to other frameworks like +[MLX](https://ml-explore.github.io/mlx/build/html/index.html) and +[JAX](https://docs.jax.dev/en/latest/) + +![Throughput](graph_docs/Comparison.png) [![Continuous Integration Test](https://github.com/ORNL-Fusion/graph_framework/actions/workflows/ci_test.yaml/badge.svg?branch=main)](https://github.com/ORNL-Fusion/graph_framework/actions/workflows/ci_test.yaml) @@ -20,5 +38,5 @@ git clone https://github.com/ORNL-Fusion/graph_framework.git For instructions to build the code consult the [build system](https://ornl-fusion.github.io/graph_framework-docs/build_system.html) documentation. This framework uses a [cmake](https://cmake.org) based build -system and requires the [NetCDF-C](https://www.unidata.ucar.edu/software/netcdf/) -library. +system and requires the +[NetCDF-C](https://www.unidata.ucar.edu/software/netcdf/) library. 
diff --git a/graph_docs/Cold-Plasma-Result.png b/graph_docs/Cold-Plasma-Result.png new file mode 100644 index 0000000000000000000000000000000000000000..0f183f8317d6260d0433e14b340179487eab7d90 Binary files /dev/null and b/graph_docs/Cold-Plasma-Result.png differ diff --git a/graph_docs/Cold-Plasma-Result2.png b/graph_docs/Cold-Plasma-Result2.png new file mode 100644 index 0000000000000000000000000000000000000000..ef3b7aaa7aa0e7a77b9d6fadbb1e3b493fb5a6cc Binary files /dev/null and b/graph_docs/Cold-Plasma-Result2.png differ diff --git a/graph_docs/Comparison.png b/graph_docs/Comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..af33ffbff9ca889039a2f6d3897d648ef611f17b Binary files /dev/null and b/graph_docs/Comparison.png differ diff --git a/graph_docs/GenrayComponents.png b/graph_docs/GenrayComponents.png new file mode 100644 index 0000000000000000000000000000000000000000..1887b1dc22e6c8880471b57e38013a3f35c545dd Binary files /dev/null and b/graph_docs/GenrayComponents.png differ diff --git a/graph_docs/GenrayRays.png b/graph_docs/GenrayRays.png new file mode 100644 index 0000000000000000000000000000000000000000..e41c488393f59d404b3313d00b00bbfff5270ddf Binary files /dev/null and b/graph_docs/GenrayRays.png differ diff --git a/graph_docs/ParticleTraces.png b/graph_docs/ParticleTraces.png new file mode 100644 index 0000000000000000000000000000000000000000..df7c6e878fa416ab63804626ac3c10908e4724ef Binary files /dev/null and b/graph_docs/ParticleTraces.png differ diff --git a/graph_docs/StellaratorRays.png b/graph_docs/StellaratorRays.png new file mode 100644 index 0000000000000000000000000000000000000000..69cbca119e1a74fd3f81ca75494cc48d58de08ad Binary files /dev/null and b/graph_docs/StellaratorRays.png differ diff --git a/graph_docs/StrongScaling.png b/graph_docs/StrongScaling.png new file mode 100644 index 0000000000000000000000000000000000000000..e3e2d219f42c4b7332cc51e21011c30dbed506f3 Binary files /dev/null and 
b/graph_docs/StrongScaling.png differ diff --git a/graph_docs/TokamakRays.png b/graph_docs/TokamakRays.png new file mode 100644 index 0000000000000000000000000000000000000000..e455e67925bfe38c6d543ea63dd441f3d14705f2 Binary files /dev/null and b/graph_docs/TokamakRays.png differ diff --git a/graph_docs/Tree.png b/graph_docs/Tree.png new file mode 100644 index 0000000000000000000000000000000000000000..c7fd7ba589f894ad659ae8401229c3a1d1f8dd6a Binary files /dev/null and b/graph_docs/Tree.png differ diff --git a/graph_docs/code_performance.dox b/graph_docs/code_performance.dox new file mode 100644 index 0000000000000000000000000000000000000000..444574ae740f20aee347942267be8d867b918573 --- /dev/null +++ b/graph_docs/code_performance.dox @@ -0,0 +1,206 @@ +/*! + * @page code_performance Code Performance + * @brief Benchmark cases for the graph framework. + * @tableofcontents + * @section code_performance_introduction Introduction + * In this section we will look at the performance of codes developed using this + * framework. + * + * @section code_performance_scaling Strong Scaling + * @image html StrongScaling.png "Left: Strong scaling wall time for 100000 Rays traced in a realistic tokamak equilibrium. Right: Strong scaling speedup normalized to the wall time for a single device or core. The dashed diagonal line references the best possible scaling. The M2 Max has 8 fast performance cores and 4 slower energy efficiency cores resulting in a drop off in improvement beyond 8 cores." + * To benchmark code performance we traced @f$10^{6}@f$ rays for @f$10^{3}@f$ + * timesteps using the cold plasma dispersion relation in a realistic tokamak + * equilibrium. The figure above shows the strong scaling of wall time as the + * number of GPU and CPU devices is increased. The figure above shows the + * strong scaling speed up + * @f{equation}{SpeedUp = \frac{time\left(1\right)}{time\left(n\right)}@f} + * Benchmarking was performed on two different setups. 
+ * + * The first setup was a Mac Studio with an Apple M2 Max chip. The M2 chip + * contains a 12 core CPU where 8 cores are faster performance cores and the + * remaining 4 are slower efficiency cores. The M2 Max also contains a single + * 38-core GPU which only supports single precision operations. The second setup + * is a server with 4 Nvidia A100 GPUs. Benchmarking measures the time to trace + * @f$10^{6}@f$ rays but does not include the setup and JIT times. + * + * The figure above shows the advantage even a single GPU has over CPU + * execution. In single precision, the M2's GPU is almost @f$100\times@f$ faster + * than a single CPU core while a single A100 has a nearly @f$800\times@f$ advantage. + * An interesting thing to note is the M2 Max CPU shows no advantage between + * single and double precision execution. + * + * For large problem sizes the framework is expected to show good scaling with + * number of devices as the problems we are applying are embarrassingly parallel + * in nature. The figure above shows the strong scaling speed up with the number + * of devices. The framework shows good strong scaling as the problem is split + * among more devices. The architecture of the M2 Chip contains 8 fast + * performance cores and 4 slower energy efficiency cores. This produces a + * noticeable knee in the scaling after 8 cores are used. Overall, the framework + * demonstrates good scaling across CPU and GPU devices. + * + * @section code_performance_comparison Comparison to other Frameworks + * @image html Comparison.png "Particle throughput for graph framework compared to MLX and JAX." + * To benchmark against other frameworks we will look at the simple case of gyro + * motion in a uniform magnetic field @f$\vec{B}=B_{0}\hat{z}@f$. 
+ * @f{equation}{\frac{\partial\vec{v}}{\partial t} = \vec{v}\times\vec{B}@f} + * @f{equation}{\frac{\partial\vec{x}}{\partial t} = \vec{v}@f} + * + * We compared the graph framework against the MLX framework since it supports + * Apple GPUs and JAX due to its popularity. Source codes for this benchmark + * case are available in the appendix. The figure above shows the throughput of + * pushing @f$10^{8}@f$ particles for @f$10^{3}@f$ time steps. The graph framework + * consistently shows the best throughput on both CPUs and GPUs. Note MLX CPU + * throughput could be improved by splitting the problem across multiple threads. + * + * @subsection code_performance_comparison_codes Source codes for throughput benchmark comparison + * @subsubsection code_performance_comparison_graph Graph Framework + * @code +const size_t size = 100000000; +const size_t steps = 1000; + +const unsigned int num_devices = jit::context::max_concurrency(); +std::vector + threads(std::max(std::min(num_devices), + static_cast (size)), + static_cast (1))); + +const size_t batch = size/threads.size(); +const size_t extra = size%threads.size(); + +timing::measure_diagnostic_threaded time_steps("Time Steps"); + +for (size_t i = 0, ie = threads.size(); i < ie; i++) { + threads[i] = + std::thread([batch, extra, + &time_steps] (const size_t thread_number) -> void { + const size_t local_size = batch + (extra > thread_number ? 
1 : 0); + + auto x = graph::variable (local_size, 0.0, "x"); + auto y = graph::variable (local_size, 0.0, "y"); + auto z = graph::variable (local_size, 0.0, "z"); + + auto vx = graph::variable (local_size, 1.0, "vx"); + auto vy = graph::variable (local_size, 0.0, "vy"); + auto vz = graph::variable (local_size, 1.0, "vz"); + + auto b = graph::vector (0.0, 0.0, 1.0); + auto v = graph::vector(vx, vy, vz); + auto pos = graph::vector(x, y, z); + + auto lorentz = v->cross(b); + auto dt = graph::constant (0.000001); + + auto v_next = v + dt*lorentz; + auto pos_next = pos + dt*v_next; + + workflow::manager work(0); + work.add_item({ + graph::variable_cast(x), + graph::variable_cast(y), + graph::variable_cast(z), + graph::variable_cast(vx), + graph::variable_cast(vy), + graph::variable_cast(vz) + }, {}, { + {pos_next->get_x(), graph::variable_cast(x)}, + {pos_next->get_y(), graph::variable_cast(y)}, + {pos_next->get_z(), graph::variable_cast(z)}, + {v_next->get_x(), graph::variable_cast(vx)}, + {v_next->get_y(), graph::variable_cast(vy)}, + {v_next->get_z(), graph::variable_cast(vz)} + }, NULL, "Lorentz_kernel", local_size); + work.compile(); + + time_steps.start_time(thread_number); + for (size_t j = 0; j < steps; j++) { + work.run(); + } + work.wait(); + time_steps.end_time(thread_number); + }, i); +} + +for (std::thread &t : threads) { + t.join(); +} + +time_steps.print_max(); + @endcode + * + * @subsubsection code_performance_comparison_mlx MLX + * @code +typedef const std::vector &inputs; +typedef std::vector outputs; +typedef std::function function; + +mlx::core::set_default_device(mlx::core::Device::gpu); + +function push = mlx::core::compile([](inputs in) -> outputs { + const float dt = 0.000001; + const mlx::core::array zero = mlx::core::zeros({1}); + const mlx::core::array one = mlx::core::zeros({1}); + const mlx::core::array vx_next = in[3] + dt*(in[4]*one - in[5]*zero); + const mlx::core::array vy_next = in[4] + dt*(in[5]*zero - in[3]*one); + const 
mlx::core::array vz_next = in[5] + dt*(in[3]*zero - in[4]*zero); + const mlx::core::array x_next = in[0] + dt*vx_next; + const mlx::core::array y_next = in[1] + dt*vy_next; + const mlx::core::array z_next = in[2] + dt*vz_next; + return {x_next, y_next, z_next, vx_next, vy_next, vz_next}; +}); + +const int size = 100000000; +const int steps = 1000; + +mlx::core::array x = mlx::core::zeros({size}); +mlx::core::array y = mlx::core::zeros({size}); +mlx::core::array z = mlx::core::zeros({size}); +mlx::core::array vx = mlx::core::ones({size}); +mlx::core::array vy = mlx::core::zeros({size}); +mlx::core::array vz = mlx::core::ones({size}); +outputs in = {x, y, z, vx, vy, vz}; + +const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + +for(size_t i = 0; i < steps; i++) { + in = push(in); + for (mlx::core::array &i : in) { + i.eval(); + } +} + +std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); +const auto total_time = end - start; + @endcode + * + * @subsubsection code_performance_comparison_jaxx JAX + * @code +def push(x, y, z, vx, vy, vz): + dt = 0.000001 + vx_next = vx + dt*(vy*1 - vz*0) + vy_next = vy + dt*(vz*0 - vy*1) + vz_next = vz + dt*(vx*0 - vy*0) + return vx_next, vy_next, vz_next, + x + dt*vx_next, y + dt*vy_next, z + dt*vz_next + +push_jit = jax.jit(push) + +size = 100000000 +steps = 1000 + +x = jax.numpy.zeros((size)) +y = jax.numpy.zeros((size)) +z = jax.numpy.zeros((size)) +vx = jax.numpy.ones((size)) +vy = jax.numpy.zeros((size)) +vz = jax.numpy.ones((size)) + +start = time.time() +for i in range(0, steps): + x, y, z, vx, vy, vz = push_jit(x, y, z, vx, vy, vz) +jax.block_until_ready([x, y, z, vx, vy, vz]) +end = time.time() + +print(end - start) + @endcode + */ + diff --git a/graph_docs/discription.dox b/graph_docs/discription.dox new file mode 100644 index 0000000000000000000000000000000000000000..6cee61523c018e8802383080f648774b3d9cbc3d --- /dev/null +++ 
b/graph_docs/discription.dox @@ -0,0 +1,132 @@ +/*! + * @page discription Description + * @brief A description of the basic functions of the graph_framework. + * @tableofcontents + * @section discription_introduction Introduction + * The basic functionality of this framework is to build expression graphs + * representing mathematical equations. Reduce those graphs to simpler forms. + * Transform those graphs to take derivatives. Just-In-Time (JIT) compile them to + * available compute device kernels. Then run those kernels in a workflow. The + * code is written using C++23 features. To simplify embedding into legacy + * codes, there are additional language bindings for C and Fortran. + * + * @subsection discription_graphs Graphs + * @image html Tree.png "Mathematical operations are defined as a tree of operations. A df method transforms the tree by applying the derivative chain rule to each node. A reduce method applies algebraic rules removing nodes from the graph." + * The foundation of this framework is built around a tree data structure that + * enables the symbolic evaluation of mathematical expressions. The @ref graph + * namespace contains classes which symbolically represent mathematical + * operations and symbols. Each node of the graph is defined as a class derived + * from a @ref graph::leaf_node base class. The @ref graph::leaf_node class + * defines methods to @ref graph::leaf_node::evaluate, + * @ref graph::leaf_node::reduce, @ref graph::leaf_node::df, + * @ref graph::leaf_node::compile, and methods for introspection. A feature + * unique to this framework is the expression trees can be rendered to + * @f$\LaTeX@f$ allowing a domain physicist to understand the results of + * reductions and transformations. This can also be used to identify future + * reduction opportunities. 
+ * + * An important distinction of this framework compared to other auto + * differentiation frameworks is there is no distinction between nodes + * representing operations and nodes representing values. Sub-classes of + * @ref graph::leaf_node include nodes for constants, variables, arithmetic, + * basic math functions, and trigonometry functions. Other nodes encapsulate + * more complex expressions like piecewise constants which depend on the + * evaluation of an argument. These piecewise constants are used to implement + * spline interpolation expressions. + * + * Each node is constructed via factory methods. For common arithmetic + * operations, the framework overloads the +-\*\/ operators to construct + * expression nodes. The factory method checks a node_cache to avoid building + * duplicate sub-graphs. Identification of duplicate graphs is performed by + * computing a hash of the sub-graph. This hash can be rapidly checked if the + * same hash already exists in a std::map container. If the sub-graph + * already exists, the existing graph is returned; otherwise a new sub-graph is + * registered in the node_cache. + * + * Each time an expression is built, the reduce method is called to simplify the + * graph. For instance, a graph consisting of a constant added to a constant will + * be reduced to a single constant by calling the evaluate method. Sub-graph + * expressions are combined, factored out, or moved to enable better reductions + * on subsequent passes. As new ways of reducing the graph are implemented, + * current and existing code built using this framework benefit from improved + * speed. The figure above shows a visualization of the tree data structure for + * the equation of a line, the derivative, and the subsequent reductions. + * + * @subsubsection discription_graphs_builds Building Graphs + * As an example, building an expression for the line @f$y=mx+b@f$ is accomplished by + * creating a @ref graph::variable_node then applying operations on that node. 
+ * @code +auto x = graph::variable (10, "x"); +auto y = 0.5*x + 0.1; + @endcode + * In this example, we have created a @ref graph::variable_node with the symbol + * @f$x@f$ containing 10 elements. Then built the expression tree for @f$y@f$. + * Derivatives are taken using the @ref graph::leaf_node::df method. + * @code +auto dydx = y->df(x); + @endcode + * Reductions are performed transparently as expressions are created so the + * expression for @f$\frac{\partial y}{\partial x}=0.5@f$. As noted before, + * since this framework makes no distinction between the various parts of a + * graph, derivatives can also be taken with respect to sub-expressions. + * @code +auto dydmx = y->df(0.5*x); + @endcode + * In this case, the result will be @f$\frac{\partial y}{\partial 0.5*x}=1.0@f$. + * + * @subsection discription_workflows Workflows + * A @ref workflow::manager is responsible for compiling device kernels, and + * running them in order. One @ref workflow::manager is created for each device + * or thread. The user is responsible for creating threads. Each kernel is + * generated through a @ref workflow::work_item. A work item is defined by + * kernel @ref graph::input_nodes, @ref graph::output_nodes and + * @ref graph::map_nodes. Map items are used to take the results of a kernel and + * update an input buffer. Using our example of the line equation, we can create a + * workflow to compute @f$y@f$ and @f$\frac{\partial y}{\partial x}@f$. + * @code +workflow::manager work(0); +work.add_item({ + graph::variable_cast(x) +}, { + y, + dydx +}, {}, NULL, "example_kernel", 10); + @endcode + * Here we have defined a kernel called "example_kernel". It has one input + * @f$x@f$, two outputs @f$y@f$ and @f$\frac{\partial y}{\partial x}@f$, and no + * maps. The NULL argument signifies there is no + * @ref graph::random_state used. The last argument needs to match the number of + * elements in the inputs. Multiple work items can be created and will be + * executed in order of creation. 
+ * + * Once the work items are defined they can be JIT compiled to a backend device. + * The graph framework supports back ends for generic CPUs, Apple Metal GPUs, + * Nvidia Cuda GPUs, and initial HIP support for AMD GPUs. Each back end supplies + * relevant driver code to build the kernel source, compile the kernel, build + * device data buffers, and handle data synchronization between the device and + * host. All JIT operations are hidden behind a generic @ref jit::context + * interface. + * + * Each context creates a specific kernel preamble and post-fix to build the + * correct syntax. Memory access is controlled by loading memory once in the + * beginning, and storing the results once at the end. Kernel source code is + * built by recursively traversing the output nodes and calling the + * @ref graph::leaf_node::compile method of each @ref graph::leaf_node. Each + * line of code is stored in a unique register variable assuming infinite + * registers. Duplicate code is eliminated by checking if a sub-graph has + * already been traversed. Once the kernel source code is built, the kernel + * library is compiled, and a kernel dispatch function is created using a + * C++ lambda function. The resulting workflow can be called multiple times. + * @code +work.compile(); +for (size_t i = 0; i < 100; i++) { + work.run(); +} +work.wait(); + @endcode + * While this API is more explicit compared to the capabilities of JAX, PyTorch, + * TensorFlow, and MLX, it doesn't result in unexpected situations where graphs + * are being rebuilt and the user can trust when evaluation is finished. + * Additionally, device buffers are only created for kernel inputs and outputs + * allowing the user to explicitly control memory usage. 
+ */ diff --git a/graph_docs/general.dox b/graph_docs/general.dox index 78dc5299bb58f1c913e6033e75940fbd814ec28c..40b41f5e19916b6d8a3ab27e068afb368bb7b7b3 100644 --- a/graph_docs/general.dox +++ b/graph_docs/general.dox @@ -39,7 +39,7 @@ * as either variables @f$x@f$ or constants @f$m,b@f$. These nodes are connected * by nodes for multiply and addition operations. The output @f$y@f$ represents * the entire graph of operations. - * @image{} html line_graph.png "The graph structure for y = mx + b." + * @image{} html line_graph.png "The graph structure for @f$y=mx+b@f$." * Evaluation of graphs start from the top most node in this case the @f$+@f$ * operation. Evaluation of a node is not performed until all sub-nodes are * evaluated starting with the left operand. Evaluation starts by recursively diff --git a/graph_docs/main.dox b/graph_docs/main.dox index 0490d41fd9f6fe8861d177e253ffeabaf1e39755..d7eda1d705bfe77eb899b8f8ddeec217275c401b 100644 --- a/graph_docs/main.dox +++ b/graph_docs/main.dox @@ -18,6 +18,13 @@ * * Enable Auto Differentiation. * * Enable easy embedding in C, C++, and Fortran codes. * + * In this documentation we present a graph computation based framework which + * compiles physics equations to optimized kernels for execution on multiple GPU + * and CPU vendors. + * * @ref discription + * * @ref use_cases + * * @ref code_performance + * *
* @section tools User guides for tools * @subsection rf_ray_tracing RF Ray tracing diff --git a/graph_docs/use_cases.dox b/graph_docs/use_cases.dox new file mode 100644 index 0000000000000000000000000000000000000000..a2e7f530e35b6692f3262452f63c09f612e729c6 --- /dev/null +++ b/graph_docs/use_cases.dox @@ -0,0 +1,166 @@ +/*! + * @page use_cases Use Cases + * @brief Examples of physics problems created with the graph_framework. + * @tableofcontents + * @section use_cases_introduction Introduction + * There are many problems in fusion energy where the same equation needs to be + * applied to a large ensemble. This page will highlight two examples using the + * graph framework. The first is an RF ray tracing problem to determine plasma + * heating. The second example is for particle pushing. + * + * @subsection use_cases_rf RF Ray tracing + * Geometric optics is a set of asymptotic approximation methods to solve wave + * equations. The physics of the particular wave determines an algebraic + * relation between @f$\omega@f$ and @f$\vec{k}@f$ called a dispersion relation, + * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter @f$t@f$ does not appear + * explicitly in the dispersion relation, the function + * @f$\omega\left(\vec{k}\left(t\right),\vec{x}\left(t\right)\right)@f$ is + * constant along the ray trajectory + * @f{equation}{\frac{\partial\omega}{\partial t}=\frac{\partial\omega}{\partial\vec{x}}\cdot\frac{\partial\vec{x}}{\partial t}+\frac{\partial\omega}{\partial\vec{k}}\cdot\frac{\partial\vec{k}}{\partial t}\equiv 0@f} + * by virtue of the ray equations. 
Since the dispersion relation is satisfied + * all along the ray trajectory, the derivatives needed for the ray equations + * can be obtained by implicit differentiation + * @f{equation}{\frac{\partial D}{\partial\vec{x}}+\frac{\partial D}{\partial\omega}\frac{\partial\omega}{\partial\vec{x}}=0\Rightarrow\frac{\partial\omega}{\partial\vec{x}}=-\frac{\frac{\partial D}{\partial\vec{x}}}{\frac{\partial D}{\partial\omega}}@f} + * @f{equation}{\frac{\partial D}{\partial\vec{k}}+\frac{\partial D}{\partial\omega}\frac{\partial\omega}{\partial\vec{k}}=0\Rightarrow\frac{\partial\omega}{\partial\vec{k}}=-\frac{\frac{\partial D}{\partial\vec{k}}}{\frac{\partial D}{\partial\omega}}@f} + * These equations are integrated to trace the ray. + * + * A ray tracing problem is built by implementing expressions for the plasma + * equilibrium. From the plasma equilibrium a dispersion relation is + * constructed. Equations of motion are defined using auto differentiation. + * Expressions for ray update are built for an integrator. These expressions are + * JIT compiled into a single kernel call with inputs for @f$\vec{x}@f$, + * @f$\vec{k}@f$, @f$t@f$, and @f$\omega@f$ with outputs for the dispersion + * residual, and step updates for @f$\vec{x}@f$, + * @f$\vec{k}@f$ and @f$t@f$. + * + * @image html TokamakRays.png "100000 O-Mode rays traced in a realistic tokamak geometry." + * @image html StellaratorRays.png "10000 ray trajectories for a stellarator equilibrium using the same dispersion, and integrator method." + * + * @subsubsection use_cases_rf_correctness Correctness + * Here we illustrate the correctness of the code by examining waves described + * by relatively simple dispersion relations in plane stratified plasmas, that + * is plasma with spatial variation only in the @f$x@f$ direction. In a + * spatially varying medium, at a given frequency, there may be regions in which + * the solution of the dispersion relation, @f$\vec{k}@f$, is real, and the wave + * propagates. 
In other regions @f$\vec{k}@f$ is imaginary and the wave does not + * propagate, referred to as evanescent. The boundary between a region of + * propagation and evanescence is a surface called a cut-off. It is also + * possible that surfaces occur where @f$\vec{k}@f$ diverges to infinity, in + * which case the phase velocity component normal to the surface goes to zero. + * These are called resonances. Typically, the wave is reflected at a cut-off + * and is absorbed or converted to a different type of wave at a resonance. + * These critical surfaces, therefore, demark important changes in wave + * behavior, and the behavior of rays in their vicinity is an indication of the + * correctness of the solution. + * + * For plasma, the spatial dependence of the dispersion relation comes through + * variation of the plasma equilibrium quantities. These include the vector + * magnetic field, @f$\vec{B}\left(x\right)@f$, the density of each plasma + * particle species, @f$n_{s}\left(x\right)@f$, and the temperature of each + * particle species, @f$T_{s}\left(x\right)@f$, where @f$s@f$ indicates a + * particular species. For the cases presented here a linear gradient along the + * @f$x@f$ direction is taken for either the particle density or magnetic field + * strength. + * @f{equation}{f\left(x\right)=0.1x+1@f} + * Unless otherwise noted, the following table lists values for the plasma + * conditions. + * + * + *
Plasma Parameters
Symbol Value + *
@f$m_{i}@f$ @f$3.344\times10^{-27}@f$ kg + *
@f$n_{e0}@f$ @f$1\times10^{19}@f$ m@f$^{-3}@f$ + *
@f$n_{i0}@f$ @f$n_{e0}@f$ + *
@f$T_{e}@f$ @f$1@f$ keV + *
@f$T_{i}@f$ @f$T_{e}@f$ + *
@f$\vec{B}@f$ @f$1@f$ T@f$\hat{z}@f$ + *
+ * + * The examples presented employ the @ref dispersion_function_cold_plasma + * approximation. This means that the particle thermal speeds are much less than + * the wave phase velocity and the magnetic field is sufficiently strong that + * the particle gyro-radius is much less than the wavelength. The cold-plasma + * approximation greatly simplifies the dispersion relation and is actually + * valid for a wide range of wave propagation situations of fusion interest. A + * further approximation is made that the wave frequency is so high that only + * the electrons respond to the wave fields and the much heavier ions do not + * contribute to the wave-induced plasma current. + * + * Initial conditions for the wave solution are obtained by choosing fixed + * values for @f$\omega@f$, @f$\vec{x}@f$, @f$k_{y}@f$, and @f$k_{z}@f$. The + * remaining value @f$k_{x}@f$ is solved for using a Newton method to a + * tolerance of @f$\left|D\left(\vec{k},\omega\right)\right|<10^{-15}@f$. Since + * the dispersion functions are multi-valued, an initial guess for @f$k_{x}@f$ + * selects among the possible roots. + * + * To mitigate the effects of floating point round-off errors, terms in the + * dispersion relation are combined to reduce the orders of magnitude for + * quantities. In plasmas, wave frequencies typically lie in the + * @f$100@f$ GHz range. Folding the speed of light into the frequency, + * @f$\omega'=\frac{\omega}{c}@f$, results in quantities of @f$\omega'@f$ on the + * order of @f$1000@f$. The consequence is that this modified frequency + * @f$\omega'@f$ has units of m@f$^{-1}@f$. The phase, + * @f$v_{p}=\frac{\omega}{k}@f$, and group, + * @f$v_{g}=\frac{\partial\omega}{\partial k}@f$, velocities are unitless. As a + * consequence, time now contains a speed of light term @f$t'=tc@f$ with units + * of m. This rescales time from values on the order of + * @f$1\times10^{-7}@f$ to @f$1@f$. 
Under this scheme, quantities of interest + * for ray tracing, position, @f$\vec{x}@f$, and wave number, @f$\vec{k}@f$, + * retain their normal units. The @ref dispersion_function_normal table + * summarizes the changes resulting from these modified units. The following + * results are converted back to real units unless otherwise indicated. + * @image html ColdPlasma.png "The O-Mode branch can propagate through the quiescent region between the right-hand cutoff, ωr, and the upper hybrid resonance, ωh, that the X-Mode branch cannot but is cut off at the plasma frequency, ωpe. The X-Mode branch can pass the O-Mode's plasma cutoff but is stopped by the left-hand cutoff, ωl." + * The figure above shows the dispersion relation for a uniform magnetic field + * and density gradient. This dispersion is a superposition of the + * @ref dispersion_function_o_wave and @ref dispersion_function_x_wave + * dispersion relations. For a given frequency, one branch can cross cutoffs and + * resonances while the other cannot. + * @image html Cold-Plasma-Result.png "Wave trajectory with frequencies above the right-hand cutoff, ωr, for the cold plasma dispersion relation. The O-Mode branch can pass the right-hand cutoff, ωr, and the upper hybrid resonance, ωh, while X-mode cannot. The bottom plot tracks the resulting dispersion function residual, |D(k,ω)|, of the solver as the rays are traced." + * The figure above shows the wave trajectories for two waves above + * @f$\omega_{r}@f$. The O-Mode branch can pass @f$\omega_{r}@f$ while the + * X-Mode branch is reflected. + * @image html Cold-Plasma-Result2.png "Wave trajectory with frequencies between the upper hybrid resonance, ωh, and the left-hand cutoff, ωl, for the cold plasma dispersion relation. The X-Mode branch can pass the plasma frequency cutoff, ωpe, while O-mode cannot. The O-Mode can pass through the upper hybrid resonance, ωh, while the X-Mode branch is absorbed. 
The bottom plot tracks the resulting dispersion function residual, |D(k,ω)|, of the solver as the rays are traced." + * The figure above shows the wave trajectories for two waves between + * @f$\omega_{h}@f$ and @f$\omega_{l}@f$. The X-Mode branch can pass the + * @f$\omega_{pe}@f$ cutoff while the O-Mode branch is reflected. The X-Mode + * branch is absorbed in the upper hybrid resonance, @f$\omega_{h}@f$, while the + * O-Mode branch can pass through it. + * + * @subsubsection use_cases_rf_genray Comparison to GENRAY + * Genray is an RF-Ray tracing + * code written in Fortran which operates in a cylindrical geometry. Toroidal + * equilibria can be imported using the + * EFIT EQDSK + * format. Derivative information must either be explicitly defined or use + * numerical differentiation. Since Genray does not trace within the vacuum + * region, care was taken to ensure the two codes started from the same initial + * conditions. A single ray was traced in Genray, then the initial position and wave + * number were used to initialize the Graph framework ray tracer. Additionally, + * the plasma profiles from Genray were matched with expressions in the graph framework. + * @image html GenrayRays.png "Comparison of ray trajectory using the graph framework vs. the Genray code. Both codes show the same ray trajectory. The graph framework can continue to trace the rays in the vacuum region beyond the last closed flux surface." + * The figure above shows the trajectory of rays in a tokamak cross-section. + * Both codes show a nearly identical trajectory with only a slight deviation + * towards the end of the ray. + * @image html GenrayComponents.png "Comparison of x and k along the wave from Genray and the graph framework. Both codes show the same evolution of ray position and wave number as the ray is traced." + * The figure above shows the comparison of wave position @f$\vec{x}@f$ and wave + * number @f$\vec{k}@f$. 
Since the two codes use different time steps, these are + * plotted with respect to the flux label @f$\psi@f$. Both codes show nearly + * identical results with only a small deviation towards the end of the trace. + * + * @section use_cases_particle_pushing Particle Pushing + * In order to achieve good statistics for the evolution of particle + * distributions, it's necessary to push large numbers of particles. + * Exploiting GPU resources is necessary to achieve the large number of + * particles at reasonable run times to enable self-consistent fields. An + * example is the runaway electron problem. + * + * During a disruption in a tokamak, electric fields can drive electron beams up + * to relativistic speeds. These high energy particles can lose confinement and + * damage first wall components. The + * Boris + * leap-frog algorithm can integrate particles while conserving energy and + * momentum. The algorithm updates particle position @f$\vec{x}@f$, momentum + * @f$\vec{u}@f$, and relativistic factor @f$\gamma@f$. + * @image html ParticleTraces.png "Particle trajectories in a realistic tokamak geometry." + * The figure above shows @f$100@f$ out of @f$10^{5}@f$ particle + * trajectories in a realistic tokamak geometry. 
+ */ diff --git a/graph_framework.xcodeproj/project.pbxproj b/graph_framework.xcodeproj/project.pbxproj index 58d124bb0c291872aa955fb65af1b8ad7a231ea9..c69db129984c6ee829cc5d76986fbb5460bde980 100644 --- a/graph_framework.xcodeproj/project.pbxproj +++ b/graph_framework.xcodeproj/project.pbxproj @@ -344,6 +344,9 @@ C723210222DC0D0A006BBF13 /* arithmetic.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = arithmetic.hpp; sourceTree = ""; }; C72358F52C4027A10084A489 /* commandline_parser.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = commandline_parser.hpp; sourceTree = ""; }; C725CD792840088000D0EDE2 /* physics_test.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = physics_test.cpp; sourceTree = ""; }; + C729B8B12ED7521A00A2559D /* discription.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = discription.dox; sourceTree = ""; }; + C729B8B22ED7536B00A2559D /* use_cases.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = use_cases.dox; sourceTree = ""; }; + C729B8B42ED77CA800A2559D /* code_performance.dox */ = {isa = PBXFileReference; lastKnownFileType = text; path = code_performance.dox; sourceTree = ""; }; C736902B2A38AC0E001733B0 /* erfi_test.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = erfi_test.cpp; sourceTree = ""; }; C73690312A38C498001733B0 /* erfi_test */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = erfi_test; sourceTree = BUILT_PRODUCTS_DIR; }; C736E6A42C9B526500AAE3C0 /* graph_playground */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = graph_playground; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -623,6 +626,9 @@ C75C42932E5CA60B00B0950B /* main.dox */, C75C42952E5CC80B00B0950B /* tutorial.dox */, C7DD87D32E664B440058BA66 /* code_structure.dox */, + C729B8B12ED7521A00A2559D /* 
discription.dox */, + C729B8B22ED7536B00A2559D /* use_cases.dox */, + C729B8B42ED77CA800A2559D /* code_performance.dox */, ); path = graph_docs; sourceTree = "";