From 1bba57c078f3507569f26db2d48aee5694c6a14c Mon Sep 17 00:00:00 2001 From: cianciosa Date: Tue, 28 Jan 2025 17:58:19 -0500 Subject: [PATCH 1/3] Fix cuda issue where constant literals would cause an ambiguous fma type selection. Reduce divides of divides. --- graph_framework.xcodeproj/project.pbxproj | 174 ++++++++++++++++++++++ graph_framework/arithmetic.hpp | 16 +- graph_framework/node.hpp | 5 +- graph_korc/xkorc.cpp | 6 +- graph_tests/arithmetic_test.cpp | 11 +- 5 files changed, 200 insertions(+), 12 deletions(-) diff --git a/graph_framework.xcodeproj/project.pbxproj b/graph_framework.xcodeproj/project.pbxproj index fc44df7..6a8b85a 100644 --- a/graph_framework.xcodeproj/project.pbxproj +++ b/graph_framework.xcodeproj/project.pbxproj @@ -2283,6 +2283,93 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; MACOSX_DEPLOYMENT_TARGET = 13.3; + OTHER_LDFLAGS = ( + "-lnetcdf", + "-ld_classic", + "-L/Users/m4c/Projects/graph_framework/build/_deps/llvm-build/lib", + "-lz", + "-lLLVMCoverage", + "-lLLVMSupport", + "-lLLVMDebugInfoCodeView", + "-lLLVMRemarks", + "-lLLVMJITLink", + "-lLLVMLinker", + "-lLLVMTextAPI", + "-lLLVMRuntimeDyld", + "-lLLVMOrcShared", + "-lLLVMOrcDebugging", + "-lLLVMOrcTargetProcess", + "-lLLVMOrcJIT", + "-lLLVMHipStdPar", + "-lLLVMAggressiveInstCombine", + "-lLLVMVectorize", + "-lLLVMAsmParser", + "-lLLVMOption", + "-lLLVMLTO", + "-lLLVMObject", + "-lLLVMWindowsDriver", + "-lLLVMDemangle", + "-lLLVMIRReader", + "-lLLVMIRPrinter", + "-lLLVMInstCombine", + "-lLLVMBinaryFormat", + "-lLLVMCoroutines", + "-lLLVMBitstreamReader", + "-lLLVMBitReader", + "-lLLVMBitWriter", + "-lLLVMDebugInfoDWARF", + "-lLLVMInstrumentation", + "-lLLVMCFGuard", + "-lLLVMObjCARCOpts", + "-lLLVMipo", + "-lLLVMGlobalISel", + "-lLLVMExecutionEngine", + "-lLLVMFrontendDriver", + "-lLLVMFrontendHLSL", + "-lLLVMFrontendOpenMP", + "-lLLVMFrontendOffloading", + "-lLLVMSelectionDAG", + "-lLLVMProfileData", + "-lLLVMAnalysis", + "-lLLVMScalarOpts", + "-lLLVMCodeGenTypes", + 
"-lLLVMCodeGenData", + "-lLLVMCodeGen", + "-lLLVMTargetParser", + "-lLLVMScalarOpts", + "-lLLVMTarget", + "-lLLVMTransformUtils", + "-lLLVMPasses", + "-lLLVMSupport", + "-lLLVMMCParser", + "-lLLVMMC", + "-lLLVMCore", + "-lLLVMAsmPrinter", + "-lLLVMAArch64Utils", + "-lLLVMAArch64Info", + "-lLLVMAArch64Desc", + "-lLLVMAArch64AsmParser", + "-lLLVMCGData", + "-lLLVMSandboxIR", + "-lLLVMFrontendAtomic", + "-lLLVMAArch64CodeGen", + "-lclangFrontend", + "-lclangBasic", + "-lclangEdit", + "-lclangLex", + "-lclangDriver", + "-lclangSerialization", + "-lclangAST", + "-lclangSema", + "-lclangAnalysis", + "-lclangASTMatchers", + "-lclangSupport", + "-lclangParse", + "-lclangAPINotes", + "-lclangCodeGen", + "-rpath", + /usr/local/lib, + ); PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = macosx; }; @@ -2296,6 +2383,93 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; MACOSX_DEPLOYMENT_TARGET = 13.3; + OTHER_LDFLAGS = ( + "-lnetcdf", + "-ld_classic", + "-L/Users/m4c/Projects/graph_framework/build/_deps/llvm-build/lib", + "-lz", + "-lLLVMCoverage", + "-lLLVMSupport", + "-lLLVMDebugInfoCodeView", + "-lLLVMRemarks", + "-lLLVMJITLink", + "-lLLVMLinker", + "-lLLVMTextAPI", + "-lLLVMRuntimeDyld", + "-lLLVMOrcShared", + "-lLLVMOrcDebugging", + "-lLLVMOrcTargetProcess", + "-lLLVMOrcJIT", + "-lLLVMHipStdPar", + "-lLLVMAggressiveInstCombine", + "-lLLVMVectorize", + "-lLLVMAsmParser", + "-lLLVMOption", + "-lLLVMLTO", + "-lLLVMObject", + "-lLLVMWindowsDriver", + "-lLLVMDemangle", + "-lLLVMIRReader", + "-lLLVMIRPrinter", + "-lLLVMInstCombine", + "-lLLVMBinaryFormat", + "-lLLVMCoroutines", + "-lLLVMBitstreamReader", + "-lLLVMBitReader", + "-lLLVMBitWriter", + "-lLLVMDebugInfoDWARF", + "-lLLVMInstrumentation", + "-lLLVMCFGuard", + "-lLLVMObjCARCOpts", + "-lLLVMipo", + "-lLLVMGlobalISel", + "-lLLVMExecutionEngine", + "-lLLVMFrontendDriver", + "-lLLVMFrontendHLSL", + "-lLLVMFrontendOpenMP", + "-lLLVMFrontendOffloading", + "-lLLVMSelectionDAG", + "-lLLVMProfileData", + "-lLLVMAnalysis", + 
"-lLLVMScalarOpts", + "-lLLVMCodeGenTypes", + "-lLLVMCodeGenData", + "-lLLVMCodeGen", + "-lLLVMTargetParser", + "-lLLVMScalarOpts", + "-lLLVMTarget", + "-lLLVMTransformUtils", + "-lLLVMPasses", + "-lLLVMSupport", + "-lLLVMMCParser", + "-lLLVMMC", + "-lLLVMCore", + "-lLLVMAsmPrinter", + "-lLLVMAArch64Utils", + "-lLLVMAArch64Info", + "-lLLVMAArch64Desc", + "-lLLVMAArch64AsmParser", + "-lLLVMCGData", + "-lLLVMSandboxIR", + "-lLLVMFrontendAtomic", + "-lLLVMAArch64CodeGen", + "-lclangFrontend", + "-lclangBasic", + "-lclangEdit", + "-lclangLex", + "-lclangDriver", + "-lclangSerialization", + "-lclangAST", + "-lclangSema", + "-lclangAnalysis", + "-lclangASTMatchers", + "-lclangSupport", + "-lclangParse", + "-lclangAPINotes", + "-lclangCodeGen", + "-rpath", + /usr/local/lib, + ); PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = macosx; }; diff --git a/graph_framework/arithmetic.hpp b/graph_framework/arithmetic.hpp index 3d93efa..6be028d 100644 --- a/graph_framework/arithmetic.hpp +++ b/graph_framework/arithmetic.hpp @@ -2883,10 +2883,15 @@ namespace graph { } // (a/b)/c -> a/(b*c) +// a/(b/c) -> a*c/b auto ld = divide_cast(this->left); + auto rd = divide_cast(this->right); if (ld.get()) { return ld->get_left()/(ld->get_right()*this->right); } + if (rd.get()) { + return this->left*rd->get_right()/rd->get_left(); + } // Power reductions. 
if (is_variable_combineable(this->left, @@ -3223,7 +3228,6 @@ namespace graph { // exp(a)/(c/exp(b)) -> (exp(a)*exp(b))/c // exp(a)/(exp(b)/c) -> c*(exp(a)/exp(b)) - auto rd = divide_cast(this->right); if (rd.get() && lexp.get()) { auto rdre = exp_cast(rd->get_right()); if (rdre.get()) { @@ -3813,8 +3817,8 @@ namespace graph { } } if (is_constant_combineable(this->middle, - rd->get_left()) && - !this->middle->has_constant_zero()) { + rd->get_left()) && + !this->middle->has_constant_zero()) { auto temp = rd->get_left()/this->middle; if (temp->is_normal()) { return this->middle*(this->left + @@ -3836,6 +3840,12 @@ namespace graph { // Common denominator reductions. if (ld.get() && rd.get()) { +// fma(b/c,a,b/d) -> b*(a/c + 1/d) + if (ld->get_left()->is_match(rd->get_left())) { + return ld->get_left()*(this->middle/ld->get_right() + + 1.0/rd->get_right()); + } + // fma(a/(b*c),d,e/c) -> fma(a,d,e*b)/(b*c) // fma(a/(c*b),d,e/c) -> fma(a,d,e*b)/(c*b) // fma(a/c,d,e/(c*b)) -> fma(a*b,d,e)/(b*c) diff --git a/graph_framework/node.hpp b/graph_framework/node.hpp index ad1bd7c..948f529 100644 --- a/graph_framework/node.hpp +++ b/graph_framework/node.hpp @@ -385,7 +385,7 @@ namespace graph { static std::string to_string(const T d) { return jit::format_to_string (d); } - + private: /// Storage buffer for the data. 
const backend::buffer data; @@ -464,7 +464,8 @@ namespace graph { + jit::format_to_string(this->evaluate().at(0)) + ")"; } else { - registers[this] = jit::format_to_string(this->evaluate().at(0)); + registers[this] = "(" + jit::get_type_string () + ")" + + jit::format_to_string(this->evaluate().at(0)); } #endif } diff --git a/graph_korc/xkorc.cpp b/graph_korc/xkorc.cpp index 30a9e24..6174b84 100644 --- a/graph_korc/xkorc.cpp +++ b/graph_korc/xkorc.cpp @@ -60,7 +60,7 @@ void run_korc() { auto gamma = graph::variable (local_num_particles, "\\gamma"); - auto dt = graph::constant (0.25); + auto dt = graph::constant (0.5); auto gamma_init = 1.0/graph::sqrt(1.0 - u_vec->dot(u_vec)); @@ -143,11 +143,11 @@ void run_korc() { const timeing::measure_diagnostic t_run("Run Time"); work.pre_run(); for (size_t i = 0; i < 1000000; i++) { - /*sync.join(); + sync.join(); work.wait(); sync = std::thread([&file, &dataset] () -> void { dataset.write(file); - });*/ + }); work.run(); } diff --git a/graph_tests/arithmetic_test.cpp b/graph_tests/arithmetic_test.cpp index 1d25ed4..5d34db0 100644 --- a/graph_tests/arithmetic_test.cpp +++ b/graph_tests/arithmetic_test.cpp @@ -2597,6 +2597,9 @@ template void test_divide() { // (c*a)*b/c -> a*b assert((((c*a)*b)/c)->is_match(a*b) && "Expected a*b"); +// a/(b/c) -> a*c/b + assert((a/(b/c))->is_match(a*c/b) && "Expected a*b/c"); + // (a*b*c)^2/a^2 -> (b*c)^2 // (a*b*c)^2/(a^2*d) -> (b*c)^2/d // (e*(a*b*c)^2)/(a^2*d) -> e*(b*c)^2/d @@ -2713,9 +2716,9 @@ template void test_fma() { "Expected a value of one."); // Test reduction. 
- auto var_a = graph::variable (1, ""); - auto var_b = graph::variable (1, ""); - auto var_c = graph::variable (1, ""); + auto var_a = graph::variable (1, "a"); + auto var_b = graph::variable (1, "b"); + auto var_c = graph::variable (1, "c"); // fma(1,a,b) = a + b auto one_times_vara_plus_varb = graph::fma(one, var_a, var_b); @@ -2764,7 +2767,7 @@ template void test_fma() { "Expected common var_b"); // fma(a, b, fma(c, b, d)) -> fma(b, a + c, d) - auto var_d = graph::variable (1, ""); + auto var_d = graph::variable (1, "d"); auto match1 = graph::fma(var_b, var_a + var_c, var_d); auto nested_fma1 = graph::fma(var_a, var_b, graph::fma(var_c, var_b, var_d)); -- GitLab From 84841377c11aea7ecbc4b1e4ca15c47202f143ad Mon Sep 17 00:00:00 2001 From: m4c Date: Wed, 29 Jan 2025 11:11:42 -0500 Subject: [PATCH 2/3] Adjust test tolerance for cuda only runs. --- graph_tests/solver_test.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/graph_tests/solver_test.cpp b/graph_tests/solver_test.cpp index 97c4154..9da2e4e 100644 --- a/graph_tests/solver_test.cpp +++ b/graph_tests/solver_test.cpp @@ -110,7 +110,11 @@ int main(int argc, const char * argv[]) { (void)argv; run_tests (4.0E-15); run_tests (1.0E-30); - run_tests> (4.0E-15); + if constexpr (jit::use_cuda()) { + run_tests> (5.6E-15); + } else { + run_tests> (4.0E-15); + } run_tests> (1.0E-30); END_GPU } -- GitLab From 5b58c0fc0f53975bbfb49927870e040803b3386c Mon Sep 17 00:00:00 2001 From: m4c Date: Wed, 29 Jan 2025 12:14:22 -0500 Subject: [PATCH 3/3] Remove debugging codes and fix white space. 
--- graph_tests/arithmetic_test.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/graph_tests/arithmetic_test.cpp b/graph_tests/arithmetic_test.cpp index 5d34db0..5577346 100644 --- a/graph_tests/arithmetic_test.cpp +++ b/graph_tests/arithmetic_test.cpp @@ -1179,7 +1179,7 @@ template void test_multiply() { assert(var_times_var_result.size() == 1 && "Expected single value."); assert(var_times_var_result.at(0) == static_cast (36) && "Expected 6*6 for result."); - + // Test c1*(c2*v) -> c3*v auto c3 = 2.0*(3.0*a); auto c3_cast = graph::multiply_cast(c3); @@ -1660,7 +1660,7 @@ template void test_multiply() { assert(exp_mul13_cast.get() && "Expected divide node."); assert(graph::exp_cast(exp_mul13_cast->get_left()).get() && "Expected a exp node on the left."); - + // cos(v)*a -> a*cos(v) auto cosine = graph::cos(variable); auto sine = graph::sin(variable); @@ -1794,7 +1794,7 @@ template void test_multiply() { "Expected a divide node."); assert(todivide1->is_match((var_a*var_b)/var_c) && "Expected a (a*b)/c"); - + // e1*(e2*v) -> (e1*e2)*v auto promote_var = var_b*(var_c*a); auto promote_var_cast = graph::multiply_cast(promote_var); @@ -1823,7 +1823,7 @@ template void test_multiply() { assert(promote_var4_cast->get_right()->is_match(a*a) && "Expected a^2"); assert(promote_var4_cast->get_left()->is_match(var_b*var_c) && "Expected (2 + b)*(3 + c)"); - + // (a*b)*a -> a^2*b auto gather = (var_a*var_b)*var_a; auto gather_cast = graph::multiply_cast(gather); @@ -1931,7 +1931,7 @@ template void test_divide() { "Expected to recover numerator."); assert((zero/variable)->evaluate()[0] == static_cast (0.0) && "Expected a value of zero."); - + auto two_divided_var = 2.0/variable; assert(graph::divide_cast(two_divided_var).get() && "Expected divide node."); @@ -2267,7 +2267,7 @@ template void test_divide() { auto fma_divide6 = graph::fma(graph::variable (1, ""), a, a)/a; auto fma_divide6_cast = graph::add_cast(fma_divide6); 
assert(fma_divide6_cast.get() && "Expected an add node."); - + // (a*b^c)/b^d -> a*b^(c - d) auto common_power = (variable*graph::pow(a, 3.0))/graph::pow(a, 2.0); assert(graph::multiply_cast(common_power).get() && @@ -2633,7 +2633,7 @@ template void test_fma() { "Expected two."); assert(one_times_zero_plus_two->evaluate()[0] == static_cast (2.0) && "Expected a value of two."); - + auto one_times_two_plus_zero = graph::fma(one, two, zero); auto one_times_two_plus_zero_cast = graph::constant_cast(one_times_two_plus_zero); @@ -2687,7 +2687,7 @@ template void test_fma() { assert(constant_df_cast->is(0) && "Expected zero."); assert(constant_df->evaluate()[0] == static_cast (0.0) && "Expected a value of zero."); - + auto zero_times_var_plus_two_df = zero_times_var_plus_two->df(var); auto zero_times_var_plus_two_df_cast = graph::constant_cast(zero_times_var_plus_two_df); @@ -2716,16 +2716,16 @@ template void test_fma() { "Expected a value of one."); // Test reduction. - auto var_a = graph::variable (1, "a"); - auto var_b = graph::variable (1, "b"); - auto var_c = graph::variable (1, "c"); + auto var_a = graph::variable (1, ""); + auto var_b = graph::variable (1, ""); + auto var_c = graph::variable (1, ""); // fma(1,a,b) = a + b auto one_times_vara_plus_varb = graph::fma(one, var_a, var_b); auto one_times_vara_plus_varb_cast = graph::add_cast(one_times_vara_plus_varb); assert(one_times_vara_plus_varb_cast.get() && "Expected an add node."); - + // fma(a,1,b) = a + b auto vara_times_one_plus_varb = graph::fma(var_a, one, var_b); auto vara_times_one_plus_varb_cast = @@ -2767,7 +2767,7 @@ template void test_fma() { "Expected common var_b"); // fma(a, b, fma(c, b, d)) -> fma(b, a + c, d) - auto var_d = graph::variable (1, "d"); + auto var_d = graph::variable (1, ""); auto match1 = graph::fma(var_b, var_a + var_c, var_d); auto nested_fma1 = graph::fma(var_a, var_b, graph::fma(var_c, var_b, var_d)); @@ -3074,7 +3074,7 @@ template void test_fma() { assert(chained_fma_cast2.get() 
&& "Expected muliply node."); assert(constant_cast(chained_fma_cast2->get_left()) && "Expected constant node."); - + // fma(a,b/c,fma(d,e/c,g)) -> (a*b + d*e)/c + g auto chained_fma3 = fma(var_a, var_b/var_c, fma(var_d, var_e/var_c, var)); assert(add_cast(chained_fma3).get() && "expected add node."); -- GitLab