Loading graph_framework/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ target_compile_definitions (graph_framework $<$<BOOL:${SHOW_USE_COUNT}>:SHOW_USE_COUNT> $<$<BOOL:${USE_INDEX_CACHE}>:USE_INDEX_CACHE> $<IF:$<BOOL:${USE_VERBOSE}>,USE_VERBOSE=true,USE_VERBOSE=false> $<$<BOOL:${USE_METAL}>:HEADER_DIR="$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"> ) target_include_directories (graph_framework Loading graph_framework/cuda_context.hpp +4 −2 Original line number Diff line number Diff line Loading @@ -972,11 +972,13 @@ namespace gpu { source_buffer << "input[i];" << std::endl; } source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl; source_buffer << " sub_max = max(sub_max, "; if constexpr (jit::complex_scalar<T>) { source_buffer << " sub_max = max(sub_max, abs(input[index]));" << std::endl; source_buffer << "abs(input[index]"; } else { source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl; source_buffer << "input[index]"; } source_buffer << ");" << std::endl; source_buffer << " }" << std::endl; source_buffer << " __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl; source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl; Loading graph_framework/jit.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -64,7 +64,7 @@ namespace jit { #ifdef USE_CUDA gpu::cuda_context<T, SAFE_MATH>, #elif defined(USE_METAL) gpu::metal_context<SAFE_MATH>, gpu::metal_context<T, SAFE_MATH>, #else gpu::cpu_context<T, SAFE_MATH>, #endif Loading graph_framework/metal_context.hpp +74 −24 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ #define metal_context_h #include <unordered_set> #include <stdlib.h> #import <Metal/Metal.h> Loading @@ -19,9 +20,10 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Class representing a metal gpu context. /// /// @tparam T Base type of the calculation. /// @tparam SAFE_MATH Use @ref general_concepts_safe_math operations. //------------------------------------------------------------------------------ template<bool SAFE_MATH=false> template<jit::float_scalar T, bool SAFE_MATH=false> class metal_context { private: /// The metal device. Loading Loading @@ -82,6 +84,7 @@ namespace gpu { std::vector<std::string> names, const bool add_reduction=false) { NSError *error; setenv("MTL_HEADER_SEARCH_PATHS", HEADER_DIR, 1); library = [device newLibraryWithSource:[NSString stringWithCString:kernel_source.c_str() encoding:NSUTF8StringEncoding] options:compile_options() Loading Loading @@ -141,9 +144,9 @@ namespace gpu { std::set<graph::leaf_node<float, SAFE_MATH> *> needed_buffers; const size_t buffer_element_size = sizeof(float); for (graph::shared_variable<float, SAFE_MATH> &input : inputs) { for (graph::shared_variable<T, SAFE_MATH> &input : inputs) { if (!kernel_arguments.contains(input.get())) { backend::buffer<float> buffer = input->evaluate(); backend::buffer<T> buffer = input->evaluate(); kernel_arguments[input.get()] = [device newBufferWithBytes:buffer.data() length:buffer.size()*buffer_element_size options:MTLResourceStorageModeShared]; Loading @@ -155,7 +158,7 @@ namespace gpu { needed_buffers.insert(input.get()); } } for (graph::shared_leaf<float, SAFE_MATH> &output : outputs) { for (graph::shared_leaf<T, SAFE_MATH> &output : outputs) { if (!kernel_arguments.contains(output.get())) { kernel_arguments[output.get()] = [device newBufferWithLength:num_rays*sizeof(float) options:MTLResourceStorageModeShared]; Loading Loading @@ -296,7 +299,7 @@ namespace gpu { /// @param[in] run Function to run before reduction. /// @returns A lambda function to run the kernel. //------------------------------------------------------------------------------ std::function<float(void)> create_max_call(graph::shared_leaf<float, SAFE_MATH> &argument, std::function<T(void)> create_max_call(graph::shared_leaf<T, SAFE_MATH> &argument, std::function<void(void)> run) { MTLComputePipelineDescriptor *compute = [MTLComputePipelineDescriptor new]; compute.threadGroupSizeIsMultipleOfThreadExecutionWidth = YES; Loading Loading @@ -375,7 +378,7 @@ namespace gpu { /// @param[in] nodes Nodes to output. //------------------------------------------------------------------------------ void print_results(const size_t index, const graph::output_nodes<float, SAFE_MATH> &nodes) { const graph::output_nodes<T, SAFE_MATH> &nodes) { wait(); for (auto &out : nodes) { std::cout << static_cast<float *> ([kernel_arguments[out.get()] contents])[index] << " "; Loading @@ -390,10 +393,10 @@ namespace gpu { /// @param[in] node Node to check the value for. /// @returns The value at the index. //------------------------------------------------------------------------------ float check_value(const size_t index, const graph::shared_leaf<float, SAFE_MATH> &node) { T check_value(const size_t index, const graph::shared_leaf<T, SAFE_MATH> &node) { wait(); return static_cast<float *> ([kernel_arguments[node.get()] contents])[index]; return static_cast<T *> ([kernel_arguments[node.get()] contents])[index]; } //------------------------------------------------------------------------------ Loading @@ -402,8 +405,8 @@ namespace gpu { /// @param[in] node Not to copy buffer to. /// @param[in] source Host side buffer to copy from. //------------------------------------------------------------------------------ void copy_to_device(graph::shared_leaf<float, SAFE_MATH> node, float *source) { void copy_to_device(graph::shared_leaf<T, SAFE_MATH> node, T *source) { const size_t size = [kernel_arguments[node.get()] length]; memcpy([kernel_arguments[node.get()] contents], source, size); Loading @@ -415,8 +418,8 @@ namespace gpu { /// @param[in] node Node to copy buffer from. /// @param[in,out] destination Host side buffer to copy to. //------------------------------------------------------------------------------ void copy_to_host(graph::shared_leaf<float, SAFE_MATH> node, float *destination) { void copy_to_host(graph::shared_leaf<T, SAFE_MATH> node, T *destination) { command_buffer = [queue commandBuffer]; [command_buffer commit]; Loading @@ -436,6 +439,10 @@ namespace gpu { source_buffer << "#include <metal_stdlib>" << std::endl; source_buffer << "#include <metal_simdgroup>" << std::endl; source_buffer << "using namespace metal;" << std::endl; if constexpr (jit::complex_scalar<T>) { source_buffer << "#define METAL_DEVICE_CODE" << std::endl; source_buffer << "#include <special_functions.hpp>" << std::endl; } } //------------------------------------------------------------------------------ Loading Loading @@ -595,8 +602,22 @@ namespace gpu { << jit::to_string('v', in.get()) << "[index] = "; if constexpr (SAFE_MATH) { if constexpr (jit::complex_scalar<T>) { jit::add_type<T> (source_buffer); source_buffer << " ("; source_buffer << "isnan(real(" << registers[a.get()] << ")) ? 0.0 : real(" << registers[a.get()] << "), "; source_buffer << "isnan(imag(" << registers[a.get()] << ")) ? 0.0 : imag(" << registers[a.get()] << "));" << std::endl; } else { source_buffer << "isnan(" << registers[a.get()] << ") ? 0.0 : "; << ") ? 0.0 : " << registers[a.get()] << ";" << std::endl; } } else { source_buffer << registers[a.get()] << ";" << std::endl; } source_buffer << registers[a.get()] << ";" << std::endl; out_registers.insert(out.get()); Loading @@ -613,8 +634,22 @@ namespace gpu { source_buffer << " " << jit::to_string('o', out.get()) << "[index] = "; if constexpr (SAFE_MATH) { if constexpr (jit::complex_scalar<T>) { jit::add_type<T> (source_buffer); source_buffer << " ("; source_buffer << "isnan(real(" << registers[a.get()] << ")) ? 0.0 : real(" << registers[a.get()] << "), "; source_buffer << "isnan(imag(" << registers[a.get()] << ")) ? 0.0 : imag(" << registers[a.get()] << "));" << std::endl; } else { source_buffer << "isnan(" << registers[a.get()] << ") ? 0.0 : "; << ") ? 0.0 : " << registers[a.get()] << ";" << std::endl; } } else { source_buffer << registers[a.get()] << ";" << std::endl; } source_buffer << registers[a.get()] << ";" << std::endl; out_registers.insert(out.get()); Loading @@ -634,15 +669,30 @@ namespace gpu { const size_t size) { source_buffer << std::endl; source_buffer << "kernel void max_reduction(" << std::endl; source_buffer << " constant float *input [[buffer(0)]]," << std::endl; source_buffer << " device float *result [[buffer(1)]]," << std::endl; source_buffer << " constant "; jit::add_type<T> (source_buffer); source_buffer << " *input [[buffer(0)]]," << std::endl; source_buffer << " device "; jit::add_type<T> (source_buffer); source_buffer << " *result [[buffer(1)]]," << std::endl; source_buffer << " uint i [[thread_position_in_grid]]," << std::endl; source_buffer << " uint j [[simdgroup_index_in_threadgroup]]," << std::endl; source_buffer << " uint k [[thread_index_in_simdgroup]]) {" << std::endl; source_buffer << " if (i < " << size << ") {" << std::endl; source_buffer << " float sub_max = input[i];" << std::endl; source_buffer << " " << jit::type_to_string<T> () << " sub_max = "; if constexpr (jit::complex_scalar<T>) { source_buffer << "abs(input[i]);" << std::endl; } else { source_buffer << "input[i];" << std::endl; } source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl; source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl; source_buffer << " sub_max = max(sub_max, "; if constexpr (jit::complex_scalar<T>) { source_buffer << "abs(input[index]"; } else { source_buffer << "input[index]"; } source_buffer << ");" << std::endl; source_buffer << " }" << std::endl; source_buffer << " threadgroup float thread_max[32];" << std::endl; source_buffer << " thread_max[j] = simd_max(sub_max);" << std::endl; Loading @@ -659,8 +709,8 @@ namespace gpu { /// /// @param[in] node Node to get the buffer for. //------------------------------------------------------------------------------ float *get_buffer(graph::shared_leaf<float, SAFE_MATH> &node) { return static_cast<float *> ([kernel_arguments[node.get()] contents]); T *get_buffer(graph::shared_leaf<T, SAFE_MATH> &node) { return static_cast<T *> ([kernel_arguments[node.get()] contents]); } }; } Loading graph_framework/special_functions.hpp +251 −57 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
graph_framework/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ target_compile_definitions (graph_framework $<$<BOOL:${SHOW_USE_COUNT}>:SHOW_USE_COUNT> $<$<BOOL:${USE_INDEX_CACHE}>:USE_INDEX_CACHE> $<IF:$<BOOL:${USE_VERBOSE}>,USE_VERBOSE=true,USE_VERBOSE=false> $<$<BOOL:${USE_METAL}>:HEADER_DIR="$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"> ) target_include_directories (graph_framework Loading
graph_framework/cuda_context.hpp +4 −2 Original line number Diff line number Diff line Loading @@ -972,11 +972,13 @@ namespace gpu { source_buffer << "input[i];" << std::endl; } source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl; source_buffer << " sub_max = max(sub_max, "; if constexpr (jit::complex_scalar<T>) { source_buffer << " sub_max = max(sub_max, abs(input[index]));" << std::endl; source_buffer << "abs(input[index]"; } else { source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl; source_buffer << "input[index]"; } source_buffer << ");" << std::endl; source_buffer << " }" << std::endl; source_buffer << " __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl; source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl; Loading
graph_framework/jit.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -64,7 +64,7 @@ namespace jit { #ifdef USE_CUDA gpu::cuda_context<T, SAFE_MATH>, #elif defined(USE_METAL) gpu::metal_context<SAFE_MATH>, gpu::metal_context<T, SAFE_MATH>, #else gpu::cpu_context<T, SAFE_MATH>, #endif Loading
graph_framework/metal_context.hpp +74 −24 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ #define metal_context_h #include <unordered_set> #include <stdlib.h> #import <Metal/Metal.h> Loading @@ -19,9 +20,10 @@ namespace gpu { //------------------------------------------------------------------------------ /// @brief Class representing a metal gpu context. /// /// @tparam T Base type of the calculation. /// @tparam SAFE_MATH Use @ref general_concepts_safe_math operations. //------------------------------------------------------------------------------ template<bool SAFE_MATH=false> template<jit::float_scalar T, bool SAFE_MATH=false> class metal_context { private: /// The metal device. Loading Loading @@ -82,6 +84,7 @@ namespace gpu { std::vector<std::string> names, const bool add_reduction=false) { NSError *error; setenv("MTL_HEADER_SEARCH_PATHS", HEADER_DIR, 1); library = [device newLibraryWithSource:[NSString stringWithCString:kernel_source.c_str() encoding:NSUTF8StringEncoding] options:compile_options() Loading Loading @@ -141,9 +144,9 @@ namespace gpu { std::set<graph::leaf_node<float, SAFE_MATH> *> needed_buffers; const size_t buffer_element_size = sizeof(float); for (graph::shared_variable<float, SAFE_MATH> &input : inputs) { for (graph::shared_variable<T, SAFE_MATH> &input : inputs) { if (!kernel_arguments.contains(input.get())) { backend::buffer<float> buffer = input->evaluate(); backend::buffer<T> buffer = input->evaluate(); kernel_arguments[input.get()] = [device newBufferWithBytes:buffer.data() length:buffer.size()*buffer_element_size options:MTLResourceStorageModeShared]; Loading @@ -155,7 +158,7 @@ namespace gpu { needed_buffers.insert(input.get()); } } for (graph::shared_leaf<float, SAFE_MATH> &output : outputs) { for (graph::shared_leaf<T, SAFE_MATH> &output : outputs) { if (!kernel_arguments.contains(output.get())) { kernel_arguments[output.get()] = [device newBufferWithLength:num_rays*sizeof(float) options:MTLResourceStorageModeShared]; Loading Loading @@ -296,7 +299,7 @@ namespace gpu { /// @param[in] run Function to run before reduction. /// @returns A lambda function to run the kernel. //------------------------------------------------------------------------------ std::function<float(void)> create_max_call(graph::shared_leaf<float, SAFE_MATH> &argument, std::function<T(void)> create_max_call(graph::shared_leaf<T, SAFE_MATH> &argument, std::function<void(void)> run) { MTLComputePipelineDescriptor *compute = [MTLComputePipelineDescriptor new]; compute.threadGroupSizeIsMultipleOfThreadExecutionWidth = YES; Loading Loading @@ -375,7 +378,7 @@ namespace gpu { /// @param[in] nodes Nodes to output. //------------------------------------------------------------------------------ void print_results(const size_t index, const graph::output_nodes<float, SAFE_MATH> &nodes) { const graph::output_nodes<T, SAFE_MATH> &nodes) { wait(); for (auto &out : nodes) { std::cout << static_cast<float *> ([kernel_arguments[out.get()] contents])[index] << " "; Loading @@ -390,10 +393,10 @@ namespace gpu { /// @param[in] node Node to check the value for. /// @returns The value at the index. //------------------------------------------------------------------------------ float check_value(const size_t index, const graph::shared_leaf<float, SAFE_MATH> &node) { T check_value(const size_t index, const graph::shared_leaf<T, SAFE_MATH> &node) { wait(); return static_cast<float *> ([kernel_arguments[node.get()] contents])[index]; return static_cast<T *> ([kernel_arguments[node.get()] contents])[index]; } //------------------------------------------------------------------------------ Loading @@ -402,8 +405,8 @@ namespace gpu { /// @param[in] node Not to copy buffer to. /// @param[in] source Host side buffer to copy from. //------------------------------------------------------------------------------ void copy_to_device(graph::shared_leaf<float, SAFE_MATH> node, float *source) { void copy_to_device(graph::shared_leaf<T, SAFE_MATH> node, T *source) { const size_t size = [kernel_arguments[node.get()] length]; memcpy([kernel_arguments[node.get()] contents], source, size); Loading @@ -415,8 +418,8 @@ namespace gpu { /// @param[in] node Node to copy buffer from. /// @param[in,out] destination Host side buffer to copy to. //------------------------------------------------------------------------------ void copy_to_host(graph::shared_leaf<float, SAFE_MATH> node, float *destination) { void copy_to_host(graph::shared_leaf<T, SAFE_MATH> node, T *destination) { command_buffer = [queue commandBuffer]; [command_buffer commit]; Loading @@ -436,6 +439,10 @@ namespace gpu { source_buffer << "#include <metal_stdlib>" << std::endl; source_buffer << "#include <metal_simdgroup>" << std::endl; source_buffer << "using namespace metal;" << std::endl; if constexpr (jit::complex_scalar<T>) { source_buffer << "#define METAL_DEVICE_CODE" << std::endl; source_buffer << "#include <special_functions.hpp>" << std::endl; } } //------------------------------------------------------------------------------ Loading Loading @@ -595,8 +602,22 @@ namespace gpu { << jit::to_string('v', in.get()) << "[index] = "; if constexpr (SAFE_MATH) { if constexpr (jit::complex_scalar<T>) { jit::add_type<T> (source_buffer); source_buffer << " ("; source_buffer << "isnan(real(" << registers[a.get()] << ")) ? 0.0 : real(" << registers[a.get()] << "), "; source_buffer << "isnan(imag(" << registers[a.get()] << ")) ? 0.0 : imag(" << registers[a.get()] << "));" << std::endl; } else { source_buffer << "isnan(" << registers[a.get()] << ") ? 0.0 : "; << ") ? 0.0 : " << registers[a.get()] << ";" << std::endl; } } else { source_buffer << registers[a.get()] << ";" << std::endl; } source_buffer << registers[a.get()] << ";" << std::endl; out_registers.insert(out.get()); Loading @@ -613,8 +634,22 @@ namespace gpu { source_buffer << " " << jit::to_string('o', out.get()) << "[index] = "; if constexpr (SAFE_MATH) { if constexpr (jit::complex_scalar<T>) { jit::add_type<T> (source_buffer); source_buffer << " ("; source_buffer << "isnan(real(" << registers[a.get()] << ")) ? 0.0 : real(" << registers[a.get()] << "), "; source_buffer << "isnan(imag(" << registers[a.get()] << ")) ? 0.0 : imag(" << registers[a.get()] << "));" << std::endl; } else { source_buffer << "isnan(" << registers[a.get()] << ") ? 0.0 : "; << ") ? 0.0 : " << registers[a.get()] << ";" << std::endl; } } else { source_buffer << registers[a.get()] << ";" << std::endl; } source_buffer << registers[a.get()] << ";" << std::endl; out_registers.insert(out.get()); Loading @@ -634,15 +669,30 @@ namespace gpu { const size_t size) { source_buffer << std::endl; source_buffer << "kernel void max_reduction(" << std::endl; source_buffer << " constant float *input [[buffer(0)]]," << std::endl; source_buffer << " device float *result [[buffer(1)]]," << std::endl; source_buffer << " constant "; jit::add_type<T> (source_buffer); source_buffer << " *input [[buffer(0)]]," << std::endl; source_buffer << " device "; jit::add_type<T> (source_buffer); source_buffer << " *result [[buffer(1)]]," << std::endl; source_buffer << " uint i [[thread_position_in_grid]]," << std::endl; source_buffer << " uint j [[simdgroup_index_in_threadgroup]]," << std::endl; source_buffer << " uint k [[thread_index_in_simdgroup]]) {" << std::endl; source_buffer << " if (i < " << size << ") {" << std::endl; source_buffer << " float sub_max = input[i];" << std::endl; source_buffer << " " << jit::type_to_string<T> () << " sub_max = "; if constexpr (jit::complex_scalar<T>) { source_buffer << "abs(input[i]);" << std::endl; } else { source_buffer << "input[i];" << std::endl; } source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl; source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl; source_buffer << " sub_max = max(sub_max, "; if constexpr (jit::complex_scalar<T>) { source_buffer << "abs(input[index]"; } else { source_buffer << "input[index]"; } source_buffer << ");" << std::endl; source_buffer << " }" << std::endl; source_buffer << " threadgroup float thread_max[32];" << std::endl; source_buffer << " thread_max[j] = simd_max(sub_max);" << std::endl; Loading @@ -659,8 +709,8 @@ namespace gpu { /// /// @param[in] node Node to get the buffer for. //------------------------------------------------------------------------------ float *get_buffer(graph::shared_leaf<float, SAFE_MATH> &node) { return static_cast<float *> ([kernel_arguments[node.get()] contents]); T *get_buffer(graph::shared_leaf<T, SAFE_MATH> &node) { return static_cast<T *> ([kernel_arguments[node.get()] contents]); } }; } Loading
graph_framework/special_functions.hpp +251 −57 File changed.Preview size limit exceeded, changes collapsed. Show changes