diff --git a/graph_framework.xcodeproj/project.pbxproj b/graph_framework.xcodeproj/project.pbxproj index 7cb423293194be7d39ff046ab80e5c952ddd69ed..b230c5f3522efc8bb2b624d23eac949cc6bcab1b 100644 --- a/graph_framework.xcodeproj/project.pbxproj +++ b/graph_framework.xcodeproj/project.pbxproj @@ -1726,6 +1726,7 @@ "\"CXX_ARGS=\\\"-I/Users/m4c/Projects/graph_framework/graph_framework -I/usr/local/include -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/c++/v1 -I/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/17/include -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include -I/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks -fgnuc-version=4.2.1 -std=gnu++2a\\\"\"", STATIC, "MACOS_LIB_RT=\\\"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/17.0.0/lib/darwin/libclang_rt.osx.a\\\"", + USE_INDEX_CACHE, "USE_VERBOSE=false", "$(inherited)", ); @@ -1792,6 +1793,7 @@ "-lLLVMFrontendDriver", "-lLLVMFrontendHLSL", "-lLLVMFrontendOpenMP", + "-lLLVMFrontendDirective", "-lLLVMFrontendOffloading", "-lLLVMSelectionDAG", "-lLLVMProfileData", @@ -1813,6 +1815,7 @@ "-lLLVMAArch64Info", "-lLLVMAArch64Desc", "-lLLVMAArch64AsmParser", + "-lLLVMDebugInfoDWARFLowLevel", "-lLLVMAArch64CodeGen", "-lLLVMCGData", "-lLLVMSandboxIR", @@ -1958,6 +1961,7 @@ "-lLLVMFrontendDriver", "-lLLVMFrontendHLSL", "-lLLVMFrontendOpenMP", + "-lLLVMFrontendDirective", "-lLLVMFrontendOffloading", "-lLLVMSelectionDAG", "-lLLVMProfileData", @@ -1979,6 +1983,7 @@ "-lLLVMAArch64Info", "-lLLVMAArch64Desc", "-lLLVMAArch64AsmParser", + "-lLLVMDebugInfoDWARFLowLevel", "-lLLVMAArch64CodeGen", "-lLLVMCGData", "-lLLVMSandboxIR", diff --git a/graph_framework/metal_context.hpp b/graph_framework/metal_context.hpp index f8001591d06d956774943e9368b50c76d19bf7a5..f9974741c2812b184a5157a88ad6a337ae680f73 100644 --- a/graph_framework/metal_context.hpp +++ b/graph_framework/metal_context.hpp @@ -29,8 +29,6 @@ namespace gpu { std::map *, id> kernel_arguments; /// Textures. std::map> texture_arguments; -/// Max Buffer. - id result; /// Metal command buffer. id command_buffer; /// Metal library. @@ -144,14 +142,14 @@ namespace gpu { backend::buffer buffer = input->evaluate(); kernel_arguments[input.get()] = [device newBufferWithBytes:buffer.data() length:buffer.size()*buffer_element_size - options:MTLResourceStorageModeManaged]; + options:MTLResourceStorageModeShared]; } buffers.push_back(kernel_arguments[input.get()]); } for (graph::shared_leaf &output : outputs) { if (!kernel_arguments.contains(output.get())) { kernel_arguments[output.get()] = [device newBufferWithLength:num_rays*sizeof(float) - options:MTLResourceStorageModeManaged]; + options:MTLResourceStorageModeShared]; } buffers.push_back(kernel_arguments[output.get()]); } @@ -160,7 +158,7 @@ namespace gpu { kernel_arguments[state.get()] = [device newBufferWithBytes:state->data() length:state->get_size_bytes() options:MTLResourceCPUCacheModeWriteCombined | - MTLResourceStorageModeManaged | + MTLResourceStorageModeShared | MTLResourceHazardTrackingModeUntracked]; } buffers.push_back(kernel_arguments[state.get()]); @@ -302,7 +300,7 @@ namespace gpu { } id result = [device newBufferWithLength:sizeof(float) - options:MTLResourceStorageModeManaged]; + options:MTLResourceStorageModeShared]; id buffer = kernel_arguments[argument.get()]; @@ -329,14 +327,10 @@ namespace gpu { threadsPerThreadgroup:MTLSizeMake(1024, 1, 1)]; [encoder endEncoding]; - id blit = [command_buffer blitCommandEncoder]; - [blit synchronizeResource:result]; - [blit endEncoding]; - [command_buffer commit]; [command_buffer waitUntilCompleted]; - return static_cast ([result contents])[0]; + return static_cast (result.contents)[0]; }; } @@ -355,11 +349,6 @@ namespace gpu { //------------------------------------------------------------------------------ void wait() { command_buffer = [queue commandBuffer]; - id blit = [command_buffer blitCommandEncoder]; - for (const auto &[key, value] : kernel_arguments) { - [blit synchronizeResource:value]; - } - [blit endEncoding]; [command_buffer commit]; [command_buffer waitUntilCompleted]; @@ -416,16 +405,13 @@ namespace gpu { void copy_to_host(graph::shared_leaf node, float *destination) { command_buffer = [queue commandBuffer]; - id blit = [command_buffer blitCommandEncoder]; - [blit synchronizeResource:kernel_arguments[node.get()]]; - [blit endEncoding]; [command_buffer commit]; [command_buffer waitUntilCompleted]; memcpy(destination, - [kernel_arguments[node.get()] contents], - [kernel_arguments[node.get()] length]); + kernel_arguments[node.get()].contents, + kernel_arguments[node.get()].length); } //------------------------------------------------------------------------------ @@ -618,7 +604,7 @@ namespace gpu { const size_t size) { source_buffer << std::endl; source_buffer << "kernel void max_reduction(" << std::endl; - source_buffer << " device float *input [[buffer(0)]]," << std::endl; + source_buffer << " constant float *input [[buffer(0)]]," << std::endl; source_buffer << " device float *result [[buffer(1)]]," << std::endl; source_buffer << " uint i [[thread_position_in_grid]]," << std::endl; source_buffer << " uint j [[simdgroup_index_in_threadgroup]]," << std::endl; diff --git a/graph_framework/piecewise.hpp b/graph_framework/piecewise.hpp index 612e4b22bb346c171bc33315771037d619d8ebd1..9147353e906e4f2bba47d0c96e69834c2ccb0765 100644 --- a/graph_framework/piecewise.hpp +++ b/graph_framework/piecewise.hpp @@ -968,14 +968,17 @@ void compile_index(std::ostringstream &stream, const size_t length = leaf_node::caches.backends[data_hash].size(); const size_t num_rows = length/num_columns; + shared_leaf x = this->left->compile(stream, + registers, + indices, + usage); + shared_leaf y = this->right->compile(stream, + registers, + indices, + usage); + #ifdef USE_INDEX_CACHE - if (indices.find(this->left.get()) == indices.end()) { -#endif - shared_leaf x = this->left->compile(stream, - registers, - indices, - usage); -#ifdef USE_INDEX_CACHE + if (indices.find(x.get()) == indices.end()) { indices[x.get()] = jit::to_string('i', x.get()); stream << " const " << jit::smallest_int_type (num_rows) << " " @@ -984,13 +987,7 @@ void compile_index(std::ostringstream &stream, x_scale, x_offset); x->endline(stream, usage); } - if (indices.find(this->right.get()) == indices.end()) { -#endif - shared_leaf y = this->right->compile(stream, - registers, - indices, - usage); -#ifdef USE_INDEX_CACHE + if (indices.find(y.get()) == indices.end()) { indices[y.get()] = jit::to_string('i', y.get()); stream << " const " << jit::smallest_int_type (num_columns) << " " @@ -1012,9 +1009,9 @@ void compile_index(std::ostringstream &stream, stream << " const " << jit::smallest_int_type (length) << " " << indices[temp.get()] << " = " - << indices[this->left.get()] + << indices[x.get()] << "*" << num_columns << " + " - << indices[this->right.get()] + << indices[y.get()] << ";" << std::endl; } } @@ -1048,9 +1045,9 @@ void compile_index(std::ostringstream &stream, << jit::smallest_int_type (std::max(num_rows, num_columns)) << "2(" - << indices[this->right.get()] + << indices[y.get()] << "," - << indices[this->left.get()] + << indices[x.get()] << ")).r"; #else stream << ".read(uint2("; @@ -1065,9 +1062,9 @@ void compile_index(std::ostringstream &stream, } else if constexpr (jit::use_cuda()) { #ifdef USE_INDEX_CACHE stream << ", " - << indices[this->right.get()] + << indices[y.get()] << ", " - << indices[this->left.get()]; + << indices[x.get()]; #else stream << ", "; compile_index (stream, registers[y.get()], num_columns, diff --git a/graph_framework/workflow.hpp b/graph_framework/workflow.hpp index 880d5d6fcd77a2394eddc905d3ee52aa5241bbde..c33f525f6958134fc7b74e4be4667e10c2d8fbc8 100644 --- a/graph_framework/workflow.hpp +++ b/graph_framework/workflow.hpp @@ -136,9 +136,9 @@ namespace workflow { T max_residule = max_kernel(); T last_max = std::numeric_limits::max(); T off_last_max = std::numeric_limits::max(); - while (std::abs(max_residule) > std::abs(tolarance) && - std::abs(last_max - max_residule) > std::abs(tolarance) && - std::abs(off_last_max - max_residule) > std::abs(tolarance) && + while (std::abs(max_residule) > std::abs(tolarance) && + std::abs(last_max - max_residule) > std::abs(tolarance) && + std::abs(off_last_max - max_residule) > std::abs(tolarance) && iterations++ < max_iterations) { last_max = max_residule; if (!(iterations%2)) {