diff --git a/graph_docs/Comparison.png b/graph_docs/Comparison.png
index af33ffbff9ca889039a2f6d3897d648ef611f17b..293d5769b03acd3311744eea871c3943a960aa90 100644
Binary files a/graph_docs/Comparison.png and b/graph_docs/Comparison.png differ
diff --git a/graph_docs/code_performance.dox b/graph_docs/code_performance.dox
index 444574ae740f20aee347942267be8d867b918573..ac08f5bf44283eb146269e9e78db33e35ca5ec8d 100644
--- a/graph_docs/code_performance.dox
+++ b/graph_docs/code_performance.dox
@@ -45,12 +45,16 @@
* @f{equation}{\frac{\partial\vec{v}}{\partial t} = dt\vec{v}\times\vec{B}@f}
* @f{equation}{\frac{\partial\vec{x}}{\partial t} = dt\vec{v}@f}
*
- * We compared the graph framework against the MLX framework since it supports
- * Apple GPUs and JAX due to it's popularity. Source codes for this benchmark
- * case is available in the appendix. Figure \ref{fig:compare} shows the through put of
- * pushing $10^{8}$ particles for $10^{3}$ time steps. The graph framework
- * consistently shows the best throughput on both CPUs and GPUs. Note MLX CPU
- * throughput could by improved by splitting the problem to multiple threads.
+ * We compared the graph framework against the
+ * MLX
+ * framework since it supports Apple GPUs,
+ * JAX due to its popularity,
+ * and Kokkos for its performance
+ * portability. Source code for this benchmark case is available in the
+ * appendix. Figure \ref{fig:compare} shows the throughput of pushing @f$10^{8}@f$
+ * particles for @f$10^{3}@f$ time steps. The graph framework consistently shows the
+ * best throughput on both CPUs and GPUs. Note MLX CPU throughput could be
+ * improved by splitting the problem to multiple threads.
*
* @subsection code_performance_comparison_codes Source codes for throughput benchmark comparison
* @subsubsection code_performance_comparison_graph Graph Framework
@@ -93,7 +97,7 @@ for (size_t i = 0, ie = threads.size(); i < ie; i++) {
auto v_next = v + dt*lorentz;
auto pos_next = pos + dt*v_next;
- workflow::manager work(0);
+ workflow::manager work(thread_number);
work.add_item({
graph::variable_cast(x),
graph::variable_cast(y),
@@ -177,7 +181,7 @@ const auto total_time = end - start;
def push(x, y, z, vx, vy, vz):
dt = 0.000001
vx_next = vx + dt*(vy*1 - vz*0)
- vy_next = vy + dt*(vz*0 - vy*1)
+ vy_next = vy + dt*(vz*0 - vx*1)
vz_next = vz + dt*(vx*0 - vy*0)
return vx_next, vy_next, vz_next,
x + dt*vx_next, y + dt*vy_next, z + dt*vz_next
@@ -201,6 +205,48 @@ jax.block_until_ready([x, y, z, vx, vy, vz])
end = time.time()
print(end - start)
+ @endcode
+ *
+ * @subsubsection code_performance_comparison_kokkos Kokkos
+ * @code
+const size_t size = 100000000;
+const size_t steps = 1000;
+
+using ViewVectorType = Kokkos::View<float*>;
+ViewVectorType x("x", size);
+ViewVectorType y("y", size);
+ViewVectorType z("z", size);
+
+ViewVectorType vx("vx", size);
+ViewVectorType vy("vy", size);
+ViewVectorType vz("vz", size);
+
+Kokkos::parallel_for(size, KOKKOS_LAMBDA(const int64_t index) {
+ vx[index] = 1;
+ vz[index] = 1;
+});
+
+const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+
+for (size_t i = 0; i < steps; i++) {
+ Kokkos::parallel_for(size, KOKKOS_LAMBDA(const int64_t index) {
+ const float dt = 0.000001;
+ const float vx_next = vx[index] + dt*(vy[index]*1 - vz[index]*0);
+ const float vy_next = vy[index] + dt*(vz[index]*0 - vx[index]*1);
+ const float vz_next = vz[index] + dt*(vx[index]*0 - vy[index]*0);
+ x[index] += dt*vx_next;
+ y[index] += dt*vy_next;
+ z[index] += dt*vz_next;
+ vx[index] = vx_next;
+ vy[index] = vy_next;
+ vz[index] = vz_next;
+ });
+}
+
+Kokkos::fence();
+
+std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+const auto total_time = end - start;
@endcode
*/
diff --git a/graph_docs/discription.dox b/graph_docs/discription.dox
index 6cee61523c018e8802383080f648774b3d9cbc3d..b5017a11e545ae2346421cc249805b987d68a5d3 100644
--- a/graph_docs/discription.dox
+++ b/graph_docs/discription.dox
@@ -5,8 +5,8 @@
* @section discription_introduction Introduction
* The basic functionality of this framework is to build expression graphs
* representing mathematical equations. Reduce those graphs to simpler forms.
- * Transform those graph to take derivatives. Just-In-Time (JIT) compile them to
- * available compute device kernels. Then run those kernels in workflow. The
+ * Transform those graphs to take derivatives. Just-In-Time (JIT) compile them
+ * to available compute device kernels. Then run those kernels in workflows. The
* code is written in using C++23 features. To simplify embedding into legacy
* codes, there are additional language bindings for C and Fortran.
*
@@ -48,9 +48,10 @@
* be reduced to a single constant by calling the evaluate method. Sub-graph
* expressions are combined, factored out, or moved to enable better reductions
* on subsequent passes. As new ways of reducing the graph are implemented,
- * current and existing code built using this framework benefit from improved
- * speed. The figure above shows a visualization of the tree data structure for
- * the equation of a line, the derivative, and the subsequent reductions.
+ * current and existing code built using this framework will benefit from
+ * improved speed. The figure above shows a visualization of the tree data
+ * structure for the equation of a line, the derivative, and the subsequent
+ * reductions.
*
* @subsubsection discription_graphs_builds Building Graphs
* As an example building an expression of line @f$y=mx+b@f$ accomplished by
@@ -79,8 +80,8 @@ auto dydmx = y->df(0.5*x);
* running them in order. One @ref workflow::manager is created for each device
* or thread. The user is responsible for creating threads. Each kernel is
* generated through a @ref workflow::work_item. A work item is defined by
- * kernel @ref graph::input_nodes, @ref graph::output_nodes and
- * @ref graph::map_nodes. Map items are used to take the results of kernel and
+ * kernel @ref graph::input_nodes, @ref graph::output_nodes, and
+ * @ref graph::map_nodes. Map items are used to take the results of a kernel and
* update an input buffer. Using our example of line equation, we can create a
* workflow to compute @f$y@f$ and @f$\frac{\partial y}{\partial x}@f$.
* @code
@@ -99,7 +100,7 @@ work.add_item({
* elements in the inputs. Multiple work items can be created and will be
* executed in order of creation.
*
- * Once the work items are defined that can be JIT compiled to a backend device.
+ * Once the work items are defined they can be JIT compiled to a backend device.
* The graph framework supports back ends for generic CPUs, Apple Metal GPUs,
* Nvidia Cuda GPUs, and initial HIP support of AMD GPUs. Each back end supplies
* relevant driver code to build the kernel source, compile the kernel, build
diff --git a/graph_docs/general.dox b/graph_docs/general.dox
index 40b41f5e19916b6d8a3ab27e068afb368bb7b7b3..0034b55116c7c6fba124323e3f4b38f8e34548b8 100644
--- a/graph_docs/general.dox
+++ b/graph_docs/general.dox
@@ -39,7 +39,7 @@
* as either variables @f$x@f$ or constants @f$m,b@f$. These nodes are connected
* by nodes for multiply and addition operations. The output @f$y@f$ represents
* the entire graph of operations.
- * @image{} html line_graph.png "The graph structure for @f$y=mx+b@f$."
+ * @image{} html line_graph.png "The graph structure for y = mx + b."
* Evaluation of graphs start from the top most node in this case the @f$+@f$
* operation. Evaluation of a node is not performed until all sub-nodes are
* evaluated starting with the left operand. Evaluation starts by recursively
@@ -58,9 +58,10 @@
* graphs of a function derivative. For an example of taking derivatives see the
* @ref tutorial_derivatives "auto differentiation tutorial". Lets say that we
* want to take the derivative of @f$\frac{\partial y}{\partial x}@f$. This is
- * achieved by evaluating the until bottom left most node is reached. Then a new
- * graph is build starting with @f$\frac{\partial m}{\partial x}=0@f$. Applying
- * the first half of the chain rule we build a new graph for @f$0x@f$
+ * achieved by evaluating the graph until the bottom left most node is reached.
+ * Then a new graph is constructed starting with
+ * @f$\frac{\partial m}{\partial x}=0@f$. Applying the first half of the chain
+ * rule we build a new graph for @f$0x@f$
* @image{} html line_graph_dydf1.png ""
* Then we take the derivative of the right operand and apply the second half
* of the chain rule to build a new graph for @f$0x=0@f$.
@@ -73,8 +74,8 @@
* The final expression for @f$\frac{\partial y}{\partial x}@f$ contains many
* unnecessary nodes in the graph. Instead of building full graphs, we can
* simplify and eliminate nodes as we build them. For instance, when the
- * expression @f$0x@f$ this created can be immediately reduce it to a single
- * node.
+ * expression @f$0\times x@f$ is created, this can be immediately reduced to a
+ * single node @f$0@f$.
* @image{} html line_graph_reduce1.png ""
* Applying all possible reductions reduces the final expression to
* @f$\frac{\partial y}{\partial x}=m@f$.
@@ -109,7 +110,7 @@
* @subsection general_concepts_compile_maps Maps
* Maps enable the results of an output node to be stored in an input node. This
* is used for a wide varity of cases. For instance take a gradient decent step.
- * @f{equation}{y = y + \frac{\partial f}{\partial x}@f}
+ * @f{equation}{y_{i+1} = y_{i} + \frac{\partial f}{\partial x}@f}
* In this case the output of the expression
* @f$y + \frac{\partial f}{\partial x}@f$
* can be mapped to update @f$y@f$.
@@ -122,7 +123,7 @@
*
* @section general_concepts_safe_math Safe Math
* There are some conditions where mathematically, a graph should evaluate to a
- * normal number. However, when evaluated suing floating point precision, can
+ * normal number. However, when evaluated using floating point precision, it can
* lead to Inf or NaN. An example of this the
* @f$\exp\left(x\right)@f$ function. For large argument values,
* @f$\exp\left(x\right)@f$ overflows the maximum floating point precision and
diff --git a/graph_docs/tutorial.dox b/graph_docs/tutorial.dox
index a59329ff38301e991deb015f43927bc020b27052..d3aaca42b42037383a1ec2d525da5203689712f8 100644
--- a/graph_docs/tutorial.dox
+++ b/graph_docs/tutorial.dox
@@ -13,7 +13,7 @@
* executable target which can be used to test out the API's of this framework.
* The playground starts with a blank main function.
* @code
-#include "../graph_framework/jit.hpp"
+#include "graph_framework.hpp"
int main(int argc, const char * argv[]) {
START_GPU
@@ -30,7 +30,7 @@ int main(int argc, const char * argv[]) {
* main. This will allow us to play with different floating point types. For now
* we will start with a simple float type.
* @code
-#include "../graph_framework/jit.hpp"
+#include "graph_framework.hpp"
template
void run_tutorial() {
@@ -84,16 +84,16 @@ void run_tutorial() {
* so all method are called using the -> operator.
*
* @subsection tutorial_constant Constant Nodes
- * Next we want to define a constant. There are two method to define constants
+ * Next we want to define a constant. There are two methods to define constants
* explicitly or implicitly.
* @code
template
void run_tutorial() {
auto x = graph::variable(1000, "x");
-// Define explicit constant.
+// Define explicit constant.
auto m = graph::constant (0.4);
-// Define implicit constant.
+// Define implicit constant.
const T b = 0.6;
}
@endcode
@@ -110,9 +110,9 @@ template
void run_tutorial() {
auto x = graph::variable(1000, "x");
-// Define explicit constant.
+// Define explicit constant.
auto m = graph::constant (0.4);
-// Define implicit constant.
+// Define implicit constant.
const T b = 0.6;
// Equation of a line
@@ -133,15 +133,15 @@ template
void run_tutorial() {
auto x = graph::variable(1000, "x");
-// Define explicit constant.
+// Define explicit constant.
auto m = graph::constant (0.4);
-// Define implicit constant.
+// Define implicit constant.
const T b = 0.6;
-// Equation of a line
+// Equation of a line
auto y = m*x + b;
-// Auto differentiation.
+// Auto differentiation.
auto dydx = y->df(x);
dydx->to_latex();
std::cout << std::endl;
@@ -168,18 +168,18 @@ template
void run_tutorial() {
auto x = graph::variable(3, "x");
-// Define explicit constant.
+// Define explicit constant.
auto m = graph::constant (0.4);
-// Define implicit constant.
+// Define implicit constant.
const T b = 0.6;
-// Equation of a line
+// Equation of a line
auto y = m*x + b;
-// Auto differentiation.
+// Auto differentiation.
auto dydx = y->df(x);
-// Create a workflow manager.
+// Create a workflow manager.
workflow::manager work(0);
}
@endcode
@@ -322,13 +322,13 @@ void run_tutorial() {
auto x = graph::variable (3, "x");
x->set({1.0, 2.0, 3.0});
-// Define an objective function.
+// Define an objective function.
auto f = 0.2*x*x*x + 0.6*x*x + 0.4*x + 0.5;
// Define a step update.
auto x_new = x - f/f->df(x);
-// Create a workflow manager.
+// Create a workflow manager.
workflow::manager work(0);
work.add_item({
graph::variable_cast(x)
@@ -372,7 +372,7 @@ void run_tutorial() {
* a reduction on the host side and transferring the entire array to the host.
* To improve this we can use a converge item instead.
* @code
-// Create a workflow manager.
+// Create a workflow manager.
workflow::manager work(0);
work.add_converge_item({
graph::variable_cast(x)
diff --git a/graph_docs/use_cases.dox b/graph_docs/use_cases.dox
index a2e7f530e35b6692f3262452f63c09f612e729c6..f47fa274899f10ca9e5d443faa528806d963924f 100644
--- a/graph_docs/use_cases.dox
+++ b/graph_docs/use_cases.dox
@@ -11,9 +11,9 @@
* @subsection use_cases_rf RF Ray tracing
* Geometric optics is a set of asymptotic approximation methods to solve wave
* equations. The physics of the particular wave determines an algebraic
- * relation between $\omega$ and $\vec{k}$ called a dispersion relation,
- * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter $t$ does not appear
- * explicitly in the dispersion relation, the function
+ * relation between @f$\omega@f$ and @f$\vec{k}@f$ called a dispersion relation,
+ * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter @f$t@f$ does not
+ * appear explicitly in the dispersion relation, the function
* @f$\omega\left(\vec{k}\left(t\right),\vec{x}\left(t\right)\right)@f$ is
* constant along the ray trajectory
* @f{equation}{\frac{\partial\omega}{\partial t}=\frac{\partial\omega}{\partial\vec{x}}\cdot\frac{\partial\vec{x}}{\partial t}+\frac{\partial\omega}{\partial\vec{k}}\cdot\frac{\partial\vec{k}}{\partial t}\equiv 0@f}
@@ -41,7 +41,7 @@
* by relatively simple dispersion relations in plane stratified plasmas, that
* is plasma with spatial variation only in the @f$x@f$ direction. In a
* spatially varying medium, at a given frequency, there may be regions in which
- * the solution of the dispersion relation, $\vec{k}$, is real, and the wave
+ * the solution of the dispersion relation, @f$\vec{k}@f$, is real, and the wave
* propagates. In other regions @f$\vec{k}@f$ is imaginary and the wave does not
* propagate, referred to as evanescent. The boundary between a region of
* propagation and evanescence is a surface called a cut-off. It is also
@@ -53,11 +53,11 @@
* behavior, and the behavior of rays in their vicinity is an indication of the
* correctness of the solution.
*
- * For plasma, the spatial dependence of the dispersion relation comes through
+ * For plasmas, the spatial dependence of the dispersion relation comes through
* variation of the plasma equilibrium quantities. These include the vector
* magnetic field, @f$\vec{B}\left(x\right)@f$, the density of each plasma
* particle species, @f$n_{s}\left(x\right)@f$, and the temperature of each
- * particle species, @f$T_{s}\left(x\right)@f$, where $s$ indicates a
+ * particle species, @f$T_{s}\left(x\right)@f$, where @f$s@f$ indicates a
* particular species. For the cases presented here a linear gradient along the
* @f$x@f$ direction is taken for either the particle density or magnetic field
* strength.
diff --git a/graph_framework/cpu_context.hpp b/graph_framework/cpu_context.hpp
index f7d677555bc6c4efc79aab6fcafcb1058a92d74c..9e27c9d6aeb981bc951d74dbf50afca3f77b6175 100644
--- a/graph_framework/cpu_context.hpp
+++ b/graph_framework/cpu_context.hpp
@@ -116,7 +116,7 @@ namespace gpu {
//------------------------------------------------------------------------------
/// @brief Construct a cpu context.
///
-/// @param[in] index Concurrent index. Not used.
+/// @param[in] index Device index. Not used.
//------------------------------------------------------------------------------
cpu_context(const size_t index) {
llvm::InitializeNativeTarget();
diff --git a/graph_framework/cuda_context.hpp b/graph_framework/cuda_context.hpp
index ee02e0bfc43248ed9b006f268699e6d980d90a43..dc660a09e2ea5b6a0cd02787028a746abeb8c34c 100644
--- a/graph_framework/cuda_context.hpp
+++ b/graph_framework/cuda_context.hpp
@@ -134,7 +134,7 @@ namespace gpu {
//------------------------------------------------------------------------------
/// @brief Cuda context constructor.
///
-/// @param[in] index Concurrent index.
+/// @param[in] index Device index.
//------------------------------------------------------------------------------
cuda_context(const size_t index) : result_buffer(0), module(0), offset_buffer(0) {
check_error(cuDeviceGet(&device, index), "cuDeviceGet");
diff --git a/graph_framework/metal_context.hpp b/graph_framework/metal_context.hpp
index ae78312c2c90c7571289a4ee16f58a01f1a96491..5e0bc964c49e0917a32eac8b4d1768300db51f2e 100644
--- a/graph_framework/metal_context.hpp
+++ b/graph_framework/metal_context.hpp
@@ -65,7 +65,7 @@ namespace gpu {
//------------------------------------------------------------------------------
/// @brief Construct a metal context.
///
-/// @param[in] index Concurrent index.
+/// @param[in] index Device index.
//------------------------------------------------------------------------------
metal_context(const size_t index) :
device([MTLCopyAllDevices() objectAtIndex:index]),
diff --git a/graph_framework/node.hpp b/graph_framework/node.hpp
index ae729bd9da0539cfe26987604ee94f391043e5a9..9e675db406a1f43f142f5c66e289e6aa473c70f7 100644
--- a/graph_framework/node.hpp
+++ b/graph_framework/node.hpp
@@ -311,7 +311,7 @@
/// @code
/// virtual shared_leaf remove_pseudo() {
/// if (this->has_pseudo()) {
-/// return sqrt(this->arg->remove_pseudo());
+/// return foo(this->arg->remove_pseudo());
/// }
/// return this->shared_from_this();
/// }
diff --git a/graph_framework/workflow.hpp b/graph_framework/workflow.hpp
index 421442b79c8e1ef6f3ae47532ce4d65a40f13102..012892ca757eb0ae68ed8022ab70754abdc9351c 100644
--- a/graph_framework/workflow.hpp
+++ b/graph_framework/workflow.hpp
@@ -183,7 +183,13 @@ namespace workflow {
//------------------------------------------------------------------------------
/// @brief Workflow manager constructor.
///
-/// @param[in] index Concurrent index.
+/// For GPU devices, this selects the device number to run on. For CPU devices
+/// this parameter is ignored.
+///
+/// @note It is possible to create multiple workflow managers for the same
+/// GPU device and there may be performance benefits to doing so.
+///
+/// @param[in] index Device index.
//------------------------------------------------------------------------------
manager(const size_t index) : context(index), add_reduction(false) {}