diff --git a/graph_docs/Comparison.png b/graph_docs/Comparison.png index af33ffbff9ca889039a2f6d3897d648ef611f17b..293d5769b03acd3311744eea871c3943a960aa90 100644 Binary files a/graph_docs/Comparison.png and b/graph_docs/Comparison.png differ diff --git a/graph_docs/code_performance.dox b/graph_docs/code_performance.dox index 444574ae740f20aee347942267be8d867b918573..ac08f5bf44283eb146269e9e78db33e35ca5ec8d 100644 --- a/graph_docs/code_performance.dox +++ b/graph_docs/code_performance.dox @@ -45,12 +45,16 @@ * @f{equation}{\frac{\partial\vec{v}}{\partial t} = dt\vec{v}\times\vec{B}@f} * @f{equation}{\frac{\partial\vec{x}}{\partial t} = dt\vec{v}@f} * - * We compared the graph framework against the MLX framework since it supports - * Apple GPUs and JAX due to it's popularity. Source codes for this benchmark - * case is available in the appendix. Figure \ref{fig:compare} shows the through put of - * pushing $10^{8}$ particles for $10^{3}$ time steps. The graph framework - * consistently shows the best throughput on both CPUs and GPUs. Note MLX CPU - * throughput could by improved by splitting the problem to multiple threads. + * We compared the graph framework against the + * MLX + * framework since it supports Apple GPUs, + * JAX due to its popularity, + * and Kokkos for its performance + * portability. Source code for this benchmark case is available in the + * appendix. Figure \ref{fig:compare} shows the throughput of pushing $10^{8}$ + * particles for $10^{3}$ time steps. The graph framework consistently shows the + * best throughput on both CPUs and GPUs. Note MLX CPU throughput could be + * improved by splitting the problem across multiple threads. 
* * @subsection code_performance_comparison_codes Source codes for throughput benchmark comparison * @subsubsection code_performance_comparison_graph Graph Framework @@ -93,7 +97,7 @@ for (size_t i = 0, ie = threads.size(); i < ie; i++) { auto v_next = v + dt*lorentz; auto pos_next = pos + dt*v_next; - workflow::manager work(0); + workflow::manager work(thread_number); work.add_item({ graph::variable_cast(x), graph::variable_cast(y), @@ -177,7 +181,7 @@ const auto total_time = end - start; def push(x, y, z, vx, vy, vz): dt = 0.000001 vx_next = vx + dt*(vy*1 - vz*0) - vy_next = vy + dt*(vz*0 - vy*1) + vy_next = vy + dt*(vz*0 - vx*1) vz_next = vz + dt*(vx*0 - vy*0) return vx_next, vy_next, vz_next, x + dt*vx_next, y + dt*vy_next, z + dt*vz_next @@ -201,6 +205,48 @@ jax.block_until_ready([x, y, z, vx, vy, vz]) end = time.time() print(end - start) + @endcode + * + * @subsubsection code_performance_comparison_kokkos Kokkos + * @code +const size_t size = 100000000; +const size_t steps = 1000; + +using ViewVectorType = Kokkos::View; +ViewVectorType x("x", size); +ViewVectorType y("y", size); +ViewVectorType z("z", size); + +ViewVectorType vx("vx", size); +ViewVectorType vy("vy", size); +ViewVectorType vz("vz", size); + +Kokkos::parallel_for(size, KOKKOS_LAMBDA(const int64_t index) { + vx[index] = 1; + vz[index] = 1; +}); + +const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + +for (size_t i = 0; i < steps; i++) { + Kokkos::parallel_for(size, KOKKOS_LAMBDA(const int64_t index) { + const float dt = 0.000001; + const float vx_next = vx[index] + dt*(vy[index]*1 - vz[index]*0); + const float vy_next = vy[index] + dt*(vz[index]*0 - vx[index]*1); + const float vz_next = vz[index] + dt*(vx[index]*0 - vy[index]*0); + x[index] += dt*vx_next; + y[index] += dt*vy_next; + z[index] += dt*vz_next; + vx[index] = vx_next; + vy[index] = vy_next; + vz[index] = vz_next; + }); +} + +Kokkos::fence(); + 
+std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); +const auto total_time = end - start; @endcode */ diff --git a/graph_docs/discription.dox b/graph_docs/discription.dox index 6cee61523c018e8802383080f648774b3d9cbc3d..b5017a11e545ae2346421cc249805b987d68a5d3 100644 --- a/graph_docs/discription.dox +++ b/graph_docs/discription.dox @@ -5,8 +5,8 @@ * @section discription_introduction Introduction * The basic functionality of this framework is to build expression graphs * representing mathematical equations. Reduce those graphs to simpler forms. - * Transform those graph to take derivatives. Just-In-Time (JIT) compile them to - * available compute device kernels. Then run those kernels in workflow. The + * Transform those graphs to take derivatives. Just-In-Time (JIT) compile them + * to available compute device kernels. Then run those kernels in workflows. The * code is written in using C++23 features. To simplify embedding into legacy * codes, there are additional language bindings for C and Fortran. * @@ -48,9 +48,10 @@ * be reduced to a single constant by calling the evaluate method. Sub-graph * expressions are combined, factored out, or moved to enable better reductions * on subsequent passes. As new ways of reducing the graph are implemented, - * current and existing code built using this framework benefit from improved - * speed. The figure above shows a visualization of the tree data structure for - * the equation of a line, the derivative, and the subsequent reductions. + * current and existing code built using this framework will benefit from + * improved speed. The figure above shows a visualization of the tree data + * structure for the equation of a line, the derivative, and the subsequent + * reductions. * * @subsubsection discription_graphs_builds Building Graphs * As an example building an expression of line @f$y=mx+b@f$ accomplished by @@ -79,8 +80,8 @@ auto dydmx = y->df(0.5*x); * running them in order. 
One @ref workflow::manager is created for each device * or thread. The user is responsible for creating threads. Each kernel is * generated through a @ref workflow::work_item. A work item is defined by - * kernel @ref graph::input_nodes, @ref graph::output_nodes and - * @ref graph::map_nodes. Map items are used to take the results of kernel and + * kernel @ref graph::input_nodes, @ref graph::output_nodes, and + * @ref graph::map_nodes. Map items are used to take the results of a kernel and * update an input buffer. Using our example of line equation, we can create a * workflow to compute @f$y@f$ and @f$\frac{\partial y}{\partial x}@f$. * @code @@ -99,7 +100,7 @@ work.add_item({ * elements in the inputs. Multiple work items can be created and will be * executed in order of creation. * - * Once the work items are defined that can be JIT compiled to a backend device. + * Once the work items are defined they can be JIT compiled to a backend device. * The graph framework supports back ends for generic CPUs, Apple Metal GPUs, * Nvidia Cuda GPUs, and initial HIP support of AMD GPUs. Each back end supplies * relevant driver code to build the kernel source, compile the kernel, build diff --git a/graph_docs/general.dox b/graph_docs/general.dox index 40b41f5e19916b6d8a3ab27e068afb368bb7b7b3..0034b55116c7c6fba124323e3f4b38f8e34548b8 100644 --- a/graph_docs/general.dox +++ b/graph_docs/general.dox @@ -39,7 +39,7 @@ * as either variables @f$x@f$ or constants @f$m,b@f$. These nodes are connected * by nodes for multiply and addition operations. The output @f$y@f$ represents * the entire graph of operations. - * @image{} html line_graph.png "The graph structure for @f$y=mx+b@f$." + * @image{} html line_graph.png "The graph structure for y = mx + b." * Evaluation of graphs start from the top most node in this case the @f$+@f$ * operation. Evaluation of a node is not performed until all sub-nodes are * evaluated starting with the left operand. 
Evaluation starts by recursively @@ -58,9 +58,10 @@ * graphs of a function derivative. For an example of taking derivatives see the * @ref tutorial_derivatives "auto differentiation tutorial". Lets say that we * want to take the derivative of @f$\frac{\partial y}{\partial x}@f$. This is - * achieved by evaluating the until bottom left most node is reached. Then a new - * graph is build starting with @f$\frac{\partial m}{\partial x}=0@f$. Applying - * the first half of the chain rule we build a new graph for @f$0x@f$ + * achieved by evaluating the graph until the bottom left most node is reached. + * Then a new graph is constructed starting with + * @f$\frac{\partial m}{\partial x}=0@f$. Applying the first half of the chain + * rule we build a new graph for @f$0x@f$ * @image{} html line_graph_dydf1.png "" * Then we take the derivative of the right operand and apply the second half * of the chain rule to build a new graph for @f$0x=0@f$. @@ -73,8 +74,8 @@ * The final expression for @f$\frac{\partial y}{\partial x}@f$ contains many * unnecessary nodes in the graph. Instead of building full graphs, we can * simplify and eliminate nodes as we build them. For instance, when the - * expression @f$0x@f$ this created can be immediately reduce it to a single - * node. + * expression @f$0\times x@f$ is created, this can be immediately reduced to a + * single node @f$0@f$. * @image{} html line_graph_reduce1.png "" * Applying all possible reductions reduces the final expression to * @f$\frac{\partial y}{\partial x}=m@f$. @@ -109,7 +110,7 @@ * @subsection general_concepts_compile_maps Maps * Maps enable the results of an output node to be stored in an input node. This * is used for a wide varity of cases. For instance take a gradient decent step. 
- * @f{equation}{y = y + \frac{\partial f}{\partial x}@f} + * @f{equation}{y_{i+1} = y_{i} + \frac{\partial f}{\partial x}@f} * In this case the output of the expression * @f$y + \frac{\partial f}{\partial x}@f$ * can be mapped to update @f$y@f$. @@ -122,7 +123,7 @@ *
* @section general_concepts_safe_math Safe Math * There are some conditions where mathematically, a graph should evaluate to a - * normal number. However, when evaluated suing floating point precision, can + * normal number. However, when evaluated using floating point precision, can * lead to Inf or NaN. An example of this the * @f$\exp\left(x\right)@f$ function. For large argument values, * @f$\exp\left(x\right)@f$ overflows the maximum floating point precision and diff --git a/graph_docs/tutorial.dox b/graph_docs/tutorial.dox index a59329ff38301e991deb015f43927bc020b27052..d3aaca42b42037383a1ec2d525da5203689712f8 100644 --- a/graph_docs/tutorial.dox +++ b/graph_docs/tutorial.dox @@ -13,7 +13,7 @@ * executable target which can be used to test out the API's of this framework. * The playground starts with a blank main function. * @code -#include "../graph_framework/jit.hpp" +#include "graph_framework.hpp" int main(int argc, const char * argv[]) { START_GPU @@ -30,7 +30,7 @@ int main(int argc, const char * argv[]) { * main. This will allow us to play with different floating point types. For now * we will start with a simple float type. * @code -#include "../graph_framework/jit.hpp" +#include "graph_framework.hpp" template void run_tutorial() { @@ -84,16 +84,16 @@ void run_tutorial() { * so all method are called using the -> operator. * * @subsection tutorial_constant Constant Nodes - * Next we want to define a constant. There are two method to define constants + * Next we want to define a constant. There are two methods to define constants * explicitly or implicitly. * @code template void run_tutorial() { auto x = graph::variable(1000, "x"); -// Define explicit constant. +// Define explicit constant. auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; } @endcode @@ -110,9 +110,9 @@ template void run_tutorial() { auto x = graph::variable(1000, "x"); -// Define explicit constant. +// Define explicit constant. 
auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; // Equation of a line @@ -133,15 +133,15 @@ template void run_tutorial() { auto x = graph::variable(1000, "x"); -// Define explicit constant. +// Define explicit constant. auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; -// Equation of a line +// Equation of a line auto y = m*x + b; -// Auto differentiation. +// Auto differentiation. auto dydx = y->df(x); dydx->to_latex(); std::cout << std::endl; @@ -168,18 +168,18 @@ template void run_tutorial() { auto x = graph::variable(3, "x"); -// Define explicit constant. +// Define explicit constant. auto m = graph::constant (0.4); -// Define implicit constant. +// Define implicit constant. const T b = 0.6; -// Equation of a line +// Equation of a line auto y = m*x + b; -// Auto differentiation. +// Auto differentiation. auto dydx = y->df(x); -// Create a workflow manager. +// Create a workflow manager. workflow::manager work(0); } @endcode @@ -322,13 +322,13 @@ void run_tutorial() { auto x = graph::variable (3, "x"); x->set({1.0, 2.0, 3.0}); -// Define an objective function. +// Define an objective function. auto f = 0.2*x*x*x + 0.6*x*x + 0.4*x + 0.5; // Define a step update. auto x_new = x - f/f->df(x); -// Create a workflow manager. +// Create a workflow manager. workflow::manager work(0); work.add_item({ graph::variable_cast(x) @@ -372,7 +372,7 @@ void run_tutorial() { * a reduction on the host side and transferring the entire array to the host. * To improve this we can use a converge item instead. * @code -// Create a workflow manager. +// Create a workflow manager. 
workflow::manager work(0); work.add_converge_item({ graph::variable_cast(x) diff --git a/graph_docs/use_cases.dox b/graph_docs/use_cases.dox index a2e7f530e35b6692f3262452f63c09f612e729c6..f47fa274899f10ca9e5d443faa528806d963924f 100644 --- a/graph_docs/use_cases.dox +++ b/graph_docs/use_cases.dox @@ -11,9 +11,9 @@ * @subsection use_cases_rf RF Ray tracing * Geometric optics is a set of asymptotic approximation methods to solve wave * equations. The physics of the particular wave determines an algebraic - * relation between $\omega$ and $\vec{k}$ called a dispersion relation, - * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter $t$ does not appear - * explicitly in the dispersion relation, the function + * relation between @f$\omega@f$ and @f$\vec{k}@f$ called a dispersion relation, + * @f$D\left(\omega,\vec{k}\right)=0@f$. Since the parameter @f$t@f$ does not + * appear explicitly in the dispersion relation, the function * @f$\omega\left(\vec{k}\left(t\right),\vec{x}\left(t\right)\right)@f$ is * constant along the ray trajectory * @f{equation}{\frac{\partial\omega}{\partial t}=\frac{\partial\omega}{\partial\vec{x}}\cdot\frac{\partial\vec{x}}{\partial t}+\frac{\partial\omega}{\partial\vec{k}}\cdot\frac{\partial\vec{k}}{\partial t}\equiv 0@f} @@ -41,7 +41,7 @@ * by relatively simple dispersion relations in plane stratified plasmas, that * is plasma with spatial variation only in the @f$x@f$ direction. In a * spatially varying medium, at a given frequency, there may be regions in which - * the solution of the dispersion relation, $\vec{k}$, is real, and the wave + * the solution of the dispersion relation, @f$\vec{k}@f$, is real, and the wave * propagates. In other regions @f$\vec{k}@f$ is imaginary and the wave does not * propagate, referred to as evanescent. The boundary between a region of * propagation and evanescence is a surface called a cut-off. 
It is also @@ -53,11 +53,11 @@ * behavior, and the behavior of rays in their vicinity is an indication of the * correctness of the solution. * - * For plasma, the spatial dependence of the dispersion relation comes through + * For plasmas, the spatial dependence of the dispersion relation comes through * variation of the plasma equilibrium quantities. These include the vector * magnetic field, @f$\vec{B}\left(x\right)@f$, the density of each plasma * particle species, @f$n_{s}\left(x\right)@f$, and the temperature of each - * particle species, @f$T_{s}\left(x\right)@f$, where $s$ indicates a + * particle species, @f$T_{s}\left(x\right)@f$, where @f$s@f$ indicates a * particular species. For the cases presented here a linear gradient along the * @f$x@f$ direction is taken for either the particle density or magnetic field * strength. diff --git a/graph_framework/cpu_context.hpp b/graph_framework/cpu_context.hpp index f7d677555bc6c4efc79aab6fcafcb1058a92d74c..9e27c9d6aeb981bc951d74dbf50afca3f77b6175 100644 --- a/graph_framework/cpu_context.hpp +++ b/graph_framework/cpu_context.hpp @@ -116,7 +116,7 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Construct a cpu context. /// -/// @param[in] index Concurrent index. Not used. +/// @param[in] index Device index. Not used. //------------------------------------------------------------------------------ cpu_context(const size_t index) { llvm::InitializeNativeTarget(); diff --git a/graph_framework/cuda_context.hpp b/graph_framework/cuda_context.hpp index ee02e0bfc43248ed9b006f268699e6d980d90a43..dc660a09e2ea5b6a0cd02787028a746abeb8c34c 100644 --- a/graph_framework/cuda_context.hpp +++ b/graph_framework/cuda_context.hpp @@ -134,7 +134,7 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Cuda context constructor. /// -/// @param[in] index Concurrent index. +/// @param[in] index Device index. 
//------------------------------------------------------------------------------ cuda_context(const size_t index) : result_buffer(0), module(0), offset_buffer(0) { check_error(cuDeviceGet(&device, index), "cuDeviceGet"); diff --git a/graph_framework/metal_context.hpp b/graph_framework/metal_context.hpp index ae78312c2c90c7571289a4ee16f58a01f1a96491..5e0bc964c49e0917a32eac8b4d1768300db51f2e 100644 --- a/graph_framework/metal_context.hpp +++ b/graph_framework/metal_context.hpp @@ -65,7 +65,7 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Construct a metal context. /// -/// @param[in] index Concurrent index. +/// @param[in] index Device index. //------------------------------------------------------------------------------ metal_context(const size_t index) : device([MTLCopyAllDevices() objectAtIndex:index]), diff --git a/graph_framework/node.hpp b/graph_framework/node.hpp index ae729bd9da0539cfe26987604ee94f391043e5a9..9e675db406a1f43f142f5c66e289e6aa473c70f7 100644 --- a/graph_framework/node.hpp +++ b/graph_framework/node.hpp @@ -311,7 +311,7 @@ /// @code /// virtual shared_leaf remove_pseudo() { /// if (this->has_pseudo()) { -/// return sqrt(this->arg->remove_pseudo()); +/// return foo(this->arg->remove_pseudo()); /// } /// return this->shared_from_this(); /// } diff --git a/graph_framework/workflow.hpp b/graph_framework/workflow.hpp index 421442b79c8e1ef6f3ae47532ce4d65a40f13102..012892ca757eb0ae68ed8022ab70754abdc9351c 100644 --- a/graph_framework/workflow.hpp +++ b/graph_framework/workflow.hpp @@ -183,7 +183,13 @@ namespace workflow { //------------------------------------------------------------------------------ /// @brief Workflow manager constructor. /// -/// @param[in] index Concurrent index. +/// For GPU devices, this selects the device number to run on. For CPU devices +/// this parameter is ignored. 
+/// +/// @note It is possible to create multiple workflow managers for the same +/// GPU device, and doing so may have performance benefits. +/// +/// @param[in] index Device index. //------------------------------------------------------------------------------ manager(const size_t index) : context(index), add_reduction(false) {}