Loading pkgs/development/python-modules/torch/source/default.nix +2 −15 Original line number Diff line number Diff line Loading @@ -282,7 +282,7 @@ in buildPythonPackage.override { inherit stdenv; } rec { pname = "torch"; # Don't forget to update torch-bin to the same version. version = "2.8.0"; version = "2.9.0"; pyproject = true; outputs = [ Loading @@ -304,19 +304,6 @@ buildPythonPackage.override { inherit stdenv; } rec { patches = [ ./clang19-template-warning.patch # Do not override PYTHONPATH, otherwise, the build fails with: # ModuleNotFoundError: No module named 'typing_extensions' (fetchpatch { name = "cmake-build-preserve-PYTHONPATH"; url = "https://github.com/pytorch/pytorch/commit/231c72240d80091f099c95e326d3600cba866eee.patch"; hash = "sha256-BBCjxzz2TUkx4nXRyRILA82kMwyb/4+C3eOtYqf5dhk="; }) # Fixes GCC-14 compatibility on ARM # Adapted from https://github.com/pytorch/pytorch/pull/157867 # TODO: remove at the next release ./gcc-14-arm-compat.path ] ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch Loading @@ -336,7 +323,7 @@ buildPythonPackage.override { inherit stdenv; } rec { postPatch = '' substituteInPlace pyproject.toml \ --replace-fail "setuptools>=62.3.0,<80.0" "setuptools" --replace-fail "setuptools>=70.1.0,<80.0" "setuptools" '' # Provide path to openssl binary for inductor code cache hash # InductorError: FileNotFoundError: [Errno 2] No such file or directory: 'openssl' Loading pkgs/development/python-modules/torch/source/gcc-14-arm-compat.pathdeleted 100644 → 0 +0 −49 Original line number Diff line number Diff line diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 7f05c2ad166..1632b595c4c 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -220,8 +220,12 @@ class Vectorized<BFloat16> { Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const; }; -inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float( - const Vectorized<c10::BFloat16>& a) { +#if defined(__GNUC__) && __GNUC__ == 14 +// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE +__attribute__((optimize("no-tree-vectorize"))) +#endif +inline std::tuple<Vectorized<float>, Vectorized<float>> +convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) { static_assert( Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size()); auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index 52d5383e60f..00c9f4eb253 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -26,6 +26,10 @@ namespace at::native { namespace { +#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) +// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON +__attribute__((optimize("no-tree-vectorize"))) +#endif static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { if (at::isReducedFloatingType(input.scalar_type())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 8ef0741e77a..8c94decfff0 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last( /* note: due to write issues, this one cannot be parallelized as well as * unfolded2d_copy */ +#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16) +// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16 +__attribute__((optimize("no-tree-vectorize"))) +#endif void unfolded2d_acc_kernel( ScalarType dtype, void *finput_data, pkgs/development/python-modules/torch/source/nvtx3-hpp-path-fix.patch +13 −16 Original line number Diff line number Diff line diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8..7c6b98006bf 100644 index ef5c2fd4e97..6591296b704 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -967,7 +967,7 @@ endif() @@ -952,15 +952,16 @@ endif() # ---[ nvtx if(USE_SYSTEM_NVTX) - find_path(nvtx3_dir NAMES nvtx3 PATHS ${CUDA_INCLUDE_DIRS}) + find_path(nvtx3_dir NAMES nvtx3/nvtx3.hpp PATHS ${CUDA_INCLUDE_DIRS}) find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) if(NOT nvtx3_FOUND) message(WARNING "Cannot find system NVTX3, find shipped NVTX3 instead") @@ -977,9 +977,10 @@ if(NOT TARGET CUDA::nvtx3) add_library(CUDA::nvtx3 INTERFACE IMPORTED) endif() if(NOT nvtx3_dir) else() - find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) - target_include_directories(CUDA::nvtx3 INTERFACE "${nvtx3_dir}") + find_path(nvtx3_dir NAMES nvtx3/nvtx3.hpp PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) endif() +target_include_directories(CUDA::nvtx3 INTERFACE "${nvtx3_dir}") find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) if(nvtx3_FOUND) add_library(torch::nvtx3 INTERFACE IMPORTED) target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}") target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3) + message(STATUS "Using NVTX3 include directory: ${nvtx3_dir}") # ---[ HIP else() message(WARNING "Cannot find NVTX3, find old NVTX instead") add_library(torch::nvtoolsext INTERFACE IMPORTED) pkgs/development/python-modules/torch/source/src.nix +58 −38 Original line number Diff line number Diff line Loading @@ -4,13 +4,19 @@ fetchFromGitHub, runCommand, }: assert version == "2.8.0"; assert version == "2.9.0"; rec { src_aiter = fetchFromGitHub { owner = "ROCm"; repo = "aiter"; rev = "01aae101b9e5e94d6c16a9514c9fb8df99c93150"; hash = "sha256-rJwKTzUi066ZRroK7eFsOPmNRRRq7VMQz0Xw7qcgtNo="; }; src_asmjit = fetchFromGitHub { owner = "asmjit"; repo = "asmjit"; rev = "e5d7c0bd5d9aec44d68830187138149e6a8c4e32"; hash = "sha256-sI0/9szBMvopQAmVcZSU4D/oaZYdb08AHDSZKy/Qz1g="; rev = "a3199e8857792cd10b7589ff5d58343d2c9008ea"; hash = "sha256-qb0lM1N1FIvoADNsZZdlg8HAheePv/LvSDvRhOAqZc0="; }; src_benchmark = fetchFromGitHub { owner = "google"; Loading Loading @@ -45,14 +51,20 @@ rec { src_composable_kernel = fetchFromGitHub { owner = "ROCm"; repo = "composable_kernel"; rev = "8086bbe3a78d931eb96fe12fdc014082e18d18d3"; hash = "sha256-fyL1SzRs5CXW5CWy6kCN1y1xX6cG+ur7iQlbKX2zbCM="; rev = "7fe50dc3da2069d6645d9deb8c017a876472a977"; hash = "sha256-OxA0ekcaRxAmBFlXkvS7XAX40kcWCwyytHWV6vROWjo="; }; src_composable_kernel_aiter = fetchFromGitHub { owner = "ROCm"; repo = "composable_kernel"; rev = "cffe8fa2a442ac8e80dd236a1a5d24fe3d7e0cbf"; hash = "sha256-KDmSs9NDFYbyE4wzBedVDBZGhI1BAoJHWOStFkuEk9U="; }; src_composable_kernel_fbgemm = fetchFromGitHub { owner = "jwfromm"; repo = "composable_kernel"; rev = "4a61bdd4bd4ed730e078aebc7c0fcf046ff29406"; hash = "sha256-CxcpvW4QxkUDB2zMz7NB6Rt9jXjJeGVExfrYbn9ef5I="; rev = "b1281b8b08d973a7064f864f47eeb30f3e2596e9"; hash = "sha256-ZWfTZ9UxnIpkoRnTmB5e3H/LY5a2HixkCCIMcnpc+Gw="; }; src_composable_kernel_flash-attention = fetchFromGitHub { owner = "ROCm"; Loading @@ -63,8 +75,8 @@ rec { src_cpp-httplib = fetchFromGitHub { owner = "yhirose"; repo = "cpp-httplib"; rev = "3af7f2c16147f3fbc6e4d717032daf505dc1652c"; hash = "sha256-t/ddZjKelnXQdXQvZgv9pQcJt3M2rwgbtTQNW/T6Gpk="; rev = "89c932f313c6437c38f2982869beacc89c2f2246"; hash = "sha256-+VPebnFMGNyChM20q4Z+kVOyI/qDLQjRsaGS0vo8kDM="; }; src_cpr = fetchFromGitHub { owner = "libcpr"; Loading @@ -87,20 +99,20 @@ rec { src_cudnn-frontend = fetchFromGitHub { owner = "NVIDIA"; repo = "cudnn-frontend"; rev = "666996fe3960f27170d1527e5579ba24c8d3380a"; hash = "sha256-/ae5dNjqkn5dGciGSO+Pn9pzJg+hHWqqQCPcqggWezo="; rev = "f937055efc6d414d11f4c6577e3977fe74f35fb6"; hash = "sha256-LiTajW2hrDth8wEC4Vp2lZO+CeMqK+tEKPLok7gXB/s="; }; src_cutlass = fetchFromGitHub { owner = "NVIDIA"; repo = "cutlass"; rev = "ad7b2f5e84fcfa124cb02b91d5bd26d238c0459e"; hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI="; rev = "e51efbfe18fe4f4cbb66ab814c55bf4aa0185491"; hash = "sha256-ZY+6Tg/CC6fqvU764k6QNudYDpY+s8OQklG+1aXQuns="; }; src_cutlass_fbgemm = fetchFromGitHub { owner = "jwfromm"; repo = "cutlass"; rev = "3ed8d2ec4ba35ef5d9d8353826209b6f868f63d3"; hash = "sha256-NntohGvqs6fbWusi2Qv5uzCJhMAfBv8qYoFi38D+mzk="; rev = "311f3c8e51dc0eb56310cfc6980bf63d0fbd7917"; hash = "sha256-JSr48FkrYE9mvm1+ikrqUxrYuV4Bok2EOdcyeTsMdiA="; }; src_cutlass_flash-attention = fetchFromGitHub { owner = "NVIDIA"; Loading @@ -123,8 +135,8 @@ rec { src_fbgemm = fetchFromGitHub { owner = "pytorch"; repo = "fbgemm"; rev = "157e88b750c452bef2ab4653fe9d1eeb151ce4c3"; hash = "sha256-Ka8/4gBsbtKNhKM/cWg1NmlKjVeBZvS+yS9SQQxb34A="; rev = "4b39c551efe15e6bbade20565b0ceb2d8ce3352d"; hash = "sha256-a7oNR2RMQWiaX9jLAy5Y4aniByqj7f9g65snOjPPwK0="; }; src_fbjni = fetchFromGitHub { owner = "facebookincubator"; Loading Loading @@ -201,8 +213,8 @@ rec { src_gloo = fetchFromGitHub { owner = "pytorch"; repo = "gloo"; rev = "c7b7b022c124d9643957d9bd55f57ac59fce8fa2"; hash = "sha256-pZ08gs6wQTZNVDX9uuaQZvw5JKCps8EALegNF0UVV3c="; rev = "54cbae0d3a67fa890b4c3d9ee162b7860315e341"; hash = "sha256-4g/AffVyU7iEHJI0KRkvJqeHSBMRB89V4/cfVf3yPf4="; }; src_googletest = fetchFromGitHub { owner = "google"; Loading @@ -216,12 +228,6 @@ rec { rev = "58d77fa8070e8cec2dc1ed015d66b454c8d78850"; hash = "sha256-W+OxRTVtemt2esw4P7IyGWXOonUN5ZuscjvzqkYvZbM="; }; src_googletest_fbgemm = fetchFromGitHub { owner = "google"; repo = "googletest"; rev = "f8d7d77c06936315286eb55f8de22cd23c188571"; hash = "sha256-t0RchAHTJbuI5YW4uyBPykTvcjy90JW9AOPNjIhwh6U="; }; src_googletest_kineto = fetchFromGitHub { owner = "google"; repo = "googletest"; Loading Loading @@ -261,8 +267,8 @@ rec { src_hipify_torch = fetchFromGitHub { owner = "ROCmSoftwarePlatform"; repo = "hipify_torch"; rev = "a4337c69fe0e2552a7b7b0669178926beeed828c"; hash = "sha256-B0+tDjSlZ9C5IAAgteRIgwaJNnptpp1jOP3hTF5AdOw="; rev = "63b6a7b541fa7f08f8475ca7d74054db36ff2691"; hash = "sha256-TH9fyprP21sRsxGs4VrahhFSIXDhnLvV09c+ZCE27u0="; }; src_ideep = fetchFromGitHub { owner = "intel"; Loading Loading @@ -321,14 +327,14 @@ rec { src_libuv = fetchFromGitHub { owner = "libuv"; repo = "libuv"; rev = "1dff88e5161cba5c59276d2070d2e304e4dcb242"; hash = "sha256-i6AYD1Ony0L2+3yWK6bxOfwoZEvd9qCg33QSqA7bRXI="; rev = "5152db2cbfeb5582e9c27c5ea1dba2cd9e10759b"; hash = "sha256-ayTk3qkeeAjrGj5ab7wF7vpWI8XWS1EeKKUqzaD/LY0="; }; src_mimalloc = fetchFromGitHub { owner = "microsoft"; repo = "mimalloc"; rev = "94036de6fe20bfd8a73d4a6d142fcf532ea604d9"; hash = "sha256-B0gngv16WFLBtrtG5NqA2m5e95bYVcQraeITcOX9A74="; rev = "fbd8b99c2b828428947d70fdc046bb55609be93e"; hash = "sha256-+8xZT+mVEqlqabQc+1buVH/X6FZxvCd0rWMyjPu9i4o="; }; src_mkl-dnn = fetchFromGitHub { owner = "intel"; Loading Loading @@ -415,6 +421,12 @@ rec { hash = "sha256-R4YmNzWEELSkAws/ejmNVxqXDTJwcqjLU/o/HvgRn2E="; }; src_pybind11 = fetchFromGitHub { owner = "pybind"; repo = "pybind11"; rev = "f5fbe867d2d26e4a0a9177a51f6e568868ad3dc8"; hash = "sha256-ZiwNGsE1FOkhnWv/1ib1akhQ4FZvrXRCDnnBZoPp6r4="; }; src_pybind11_onnx = fetchFromGitHub { owner = "pybind"; repo = "pybind11"; rev = "a2e59f0e7065404b44dfe92a28aca47ba1378dc4"; Loading @@ -429,8 +441,8 @@ rec { src_pytorch = fetchFromGitHub { owner = "pytorch"; repo = "pytorch"; rev = "v2.8.0"; hash = "sha256-tFEpcgj0HiJcyBiZMtIrBrnmiCJApfTC1BgOXEGvqCo="; rev = "v2.9.0"; hash = "sha256-0NdREKn9h3FtHKVe1Z3QtSOVdEcfgLlWXG/OiI+QrwA="; }; src_sleef = fetchFromGitHub { owner = "shibatch"; Loading @@ -441,8 +453,8 @@ rec { src_tensorpipe = fetchFromGitHub { owner = "pytorch"; repo = "tensorpipe"; rev = "52791a2fd214b2a9dc5759d36725909c1daa7f2e"; hash = "sha256-i+CtjNFPDUzFCPxP0//jMLJDrQoorg0On9NfoVaMUxI="; rev = "af0118d13e52f5a08841464a768e01a0bf3e3075"; hash = "sha256-X2YfYfDKxG0i2K8Uf7gpSb+LU2y0d4VBZM3lTa/ff1w="; }; src_vcpkg = fetchFromGitHub { owner = "Microsoft"; Loading @@ -462,6 +474,11 @@ rec { rev = "51a0103656eff6fc9bfd39a4597923c4b542c883"; hash = "sha256-nhowllqv/hBs7xHdTwbWtiKJ1mvAYsVIyIZ35ZGsmkg="; }; src_aiter_recursive = runCommand "aiter" { } '' cp -r ${src_aiter} $out chmod u+w $out/3rdparty/composable_kernel cp -r ${src_composable_kernel_aiter_recursive}/* $out/3rdparty/composable_kernel ''; src_asmjit_recursive = src_asmjit; src_benchmark_recursive = src_benchmark; src_benchmark_opentelemetry-cpp_recursive = src_benchmark_opentelemetry-cpp; Loading @@ -469,6 +486,7 @@ rec { src_civetweb_recursive = src_civetweb; src_clang-cindex-python3_recursive = src_clang-cindex-python3; src_composable_kernel_recursive = src_composable_kernel; src_composable_kernel_aiter_recursive = src_composable_kernel_aiter; src_composable_kernel_fbgemm_recursive = src_composable_kernel_fbgemm; src_composable_kernel_flash-attention_recursive = src_composable_kernel_flash-attention; src_cpp-httplib_recursive = src_cpp-httplib; Loading Loading @@ -510,7 +528,7 @@ rec { chmod u+w $out/external/cutlass cp -r ${src_cutlass_fbgemm_recursive}/* $out/external/cutlass chmod u+w $out/external/googletest cp -r ${src_googletest_fbgemm_recursive}/* $out/external/googletest cp -r ${src_googletest_recursive}/* $out/external/googletest chmod u+w $out/external/hipify_torch cp -r ${src_hipify_torch_recursive}/* $out/external/hipify_torch chmod u+w $out/external/json Loading Loading @@ -541,7 +559,6 @@ rec { src_gloo_recursive = src_gloo; src_googletest_recursive = src_googletest; src_googletest_dynolog_recursive = src_googletest_dynolog; src_googletest_fbgemm_recursive = src_googletest_fbgemm; src_googletest_kineto_recursive = src_googletest_kineto; src_googletest_opentelemetry-cpp_recursive = src_googletest_opentelemetry-cpp; src_googletest_prometheus-cpp_recursive = src_googletest_prometheus-cpp; Loading Loading @@ -578,7 +595,7 @@ rec { src_onnx_recursive = runCommand "onnx" { } '' cp -r ${src_onnx} $out chmod u+w $out/third_party/pybind11 cp -r ${src_pybind11_recursive}/* $out/third_party/pybind11 cp -r ${src_pybind11_onnx_recursive}/* $out/third_party/pybind11 ''; src_opentelemetry-cpp_recursive = runCommand "opentelemetry-cpp" { } '' cp -r ${src_opentelemetry-cpp} $out Loading Loading @@ -621,6 +638,7 @@ rec { src_psimd_recursive = src_psimd; src_pthreadpool_recursive = src_pthreadpool; src_pybind11_recursive = src_pybind11; src_pybind11_onnx_recursive = src_pybind11_onnx; src_pybind11_tensorpipe_recursive = runCommand "pybind11_tensorpipe" { } '' cp -r ${src_pybind11_tensorpipe} $out chmod u+w $out/tools/clang Loading @@ -630,6 +648,8 @@ rec { cp -r ${src_pytorch} $out chmod u+w $out/android/libs/fbjni cp -r ${src_fbjni_recursive}/* $out/android/libs/fbjni chmod u+w $out/third_party/aiter cp -r ${src_aiter_recursive}/* $out/third_party/aiter chmod u+w $out/third_party/benchmark cp -r ${src_benchmark_recursive}/* $out/third_party/benchmark chmod u+w $out/third_party/composable_kernel Loading pkgs/development/python-modules/torchaudio/default.nix +2 −2 Original line number Diff line number Diff line Loading @@ -77,7 +77,7 @@ let in buildPythonPackage rec { pname = "torchaudio"; version = "2.8.0"; version = "2.9.0"; pyproject = true; stdenv = torch.stdenv; Loading @@ -86,7 +86,7 @@ buildPythonPackage rec { owner = "pytorch"; repo = "audio"; tag = "v${version}"; hash = "sha256-SPa6ZWA2AWawfL4Z4mb1nddGaAsGEl/0dwweBpex2Wo="; hash = "sha256-oZTe0LWqOJ0NUxmmUKZN3GhMgloOMCYMicbYoaW2pTw="; }; patches = [ Loading Loading
pkgs/development/python-modules/torch/source/default.nix +2 −15 Original line number Diff line number Diff line Loading @@ -282,7 +282,7 @@ in buildPythonPackage.override { inherit stdenv; } rec { pname = "torch"; # Don't forget to update torch-bin to the same version. version = "2.8.0"; version = "2.9.0"; pyproject = true; outputs = [ Loading @@ -304,19 +304,6 @@ buildPythonPackage.override { inherit stdenv; } rec { patches = [ ./clang19-template-warning.patch # Do not override PYTHONPATH, otherwise, the build fails with: # ModuleNotFoundError: No module named 'typing_extensions' (fetchpatch { name = "cmake-build-preserve-PYTHONPATH"; url = "https://github.com/pytorch/pytorch/commit/231c72240d80091f099c95e326d3600cba866eee.patch"; hash = "sha256-BBCjxzz2TUkx4nXRyRILA82kMwyb/4+C3eOtYqf5dhk="; }) # Fixes GCC-14 compatibility on ARM # Adapted from https://github.com/pytorch/pytorch/pull/157867 # TODO: remove at the next release ./gcc-14-arm-compat.path ] ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch Loading @@ -336,7 +323,7 @@ buildPythonPackage.override { inherit stdenv; } rec { postPatch = '' substituteInPlace pyproject.toml \ --replace-fail "setuptools>=62.3.0,<80.0" "setuptools" --replace-fail "setuptools>=70.1.0,<80.0" "setuptools" '' # Provide path to openssl binary for inductor code cache hash # InductorError: FileNotFoundError: [Errno 2] No such file or directory: 'openssl' Loading
pkgs/development/python-modules/torch/source/gcc-14-arm-compat.pathdeleted 100644 → 0 +0 −49 Original line number Diff line number Diff line diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 7f05c2ad166..1632b595c4c 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -220,8 +220,12 @@ class Vectorized<BFloat16> { Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const; }; -inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float( - const Vectorized<c10::BFloat16>& a) { +#if defined(__GNUC__) && __GNUC__ == 14 +// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE +__attribute__((optimize("no-tree-vectorize"))) +#endif +inline std::tuple<Vectorized<float>, Vectorized<float>> +convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) { static_assert( Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size()); auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index 52d5383e60f..00c9f4eb253 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -26,6 +26,10 @@ namespace at::native { namespace { +#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) +// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON +__attribute__((optimize("no-tree-vectorize"))) +#endif static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { if (at::isReducedFloatingType(input.scalar_type())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 8ef0741e77a..8c94decfff0 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last( /* note: due to write issues, this one cannot be parallelized as well as * unfolded2d_copy */ +#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16) +// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16 +__attribute__((optimize("no-tree-vectorize"))) +#endif void unfolded2d_acc_kernel( ScalarType dtype, void *finput_data,
pkgs/development/python-modules/torch/source/nvtx3-hpp-path-fix.patch +13 −16 Original line number Diff line number Diff line diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8..7c6b98006bf 100644 index ef5c2fd4e97..6591296b704 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -967,7 +967,7 @@ endif() @@ -952,15 +952,16 @@ endif() # ---[ nvtx if(USE_SYSTEM_NVTX) - find_path(nvtx3_dir NAMES nvtx3 PATHS ${CUDA_INCLUDE_DIRS}) + find_path(nvtx3_dir NAMES nvtx3/nvtx3.hpp PATHS ${CUDA_INCLUDE_DIRS}) find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) if(NOT nvtx3_FOUND) message(WARNING "Cannot find system NVTX3, find shipped NVTX3 instead") @@ -977,9 +977,10 @@ if(NOT TARGET CUDA::nvtx3) add_library(CUDA::nvtx3 INTERFACE IMPORTED) endif() if(NOT nvtx3_dir) else() - find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) - target_include_directories(CUDA::nvtx3 INTERFACE "${nvtx3_dir}") + find_path(nvtx3_dir NAMES nvtx3/nvtx3.hpp PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) endif() +target_include_directories(CUDA::nvtx3 INTERFACE "${nvtx3_dir}") find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) if(nvtx3_FOUND) add_library(torch::nvtx3 INTERFACE IMPORTED) target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}") target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3) + message(STATUS "Using NVTX3 include directory: ${nvtx3_dir}") # ---[ HIP else() message(WARNING "Cannot find NVTX3, find old NVTX instead") add_library(torch::nvtoolsext INTERFACE IMPORTED)
pkgs/development/python-modules/torch/source/src.nix +58 −38 Original line number Diff line number Diff line Loading @@ -4,13 +4,19 @@ fetchFromGitHub, runCommand, }: assert version == "2.8.0"; assert version == "2.9.0"; rec { src_aiter = fetchFromGitHub { owner = "ROCm"; repo = "aiter"; rev = "01aae101b9e5e94d6c16a9514c9fb8df99c93150"; hash = "sha256-rJwKTzUi066ZRroK7eFsOPmNRRRq7VMQz0Xw7qcgtNo="; }; src_asmjit = fetchFromGitHub { owner = "asmjit"; repo = "asmjit"; rev = "e5d7c0bd5d9aec44d68830187138149e6a8c4e32"; hash = "sha256-sI0/9szBMvopQAmVcZSU4D/oaZYdb08AHDSZKy/Qz1g="; rev = "a3199e8857792cd10b7589ff5d58343d2c9008ea"; hash = "sha256-qb0lM1N1FIvoADNsZZdlg8HAheePv/LvSDvRhOAqZc0="; }; src_benchmark = fetchFromGitHub { owner = "google"; Loading Loading @@ -45,14 +51,20 @@ rec { src_composable_kernel = fetchFromGitHub { owner = "ROCm"; repo = "composable_kernel"; rev = "8086bbe3a78d931eb96fe12fdc014082e18d18d3"; hash = "sha256-fyL1SzRs5CXW5CWy6kCN1y1xX6cG+ur7iQlbKX2zbCM="; rev = "7fe50dc3da2069d6645d9deb8c017a876472a977"; hash = "sha256-OxA0ekcaRxAmBFlXkvS7XAX40kcWCwyytHWV6vROWjo="; }; src_composable_kernel_aiter = fetchFromGitHub { owner = "ROCm"; repo = "composable_kernel"; rev = "cffe8fa2a442ac8e80dd236a1a5d24fe3d7e0cbf"; hash = "sha256-KDmSs9NDFYbyE4wzBedVDBZGhI1BAoJHWOStFkuEk9U="; }; src_composable_kernel_fbgemm = fetchFromGitHub { owner = "jwfromm"; repo = "composable_kernel"; rev = "4a61bdd4bd4ed730e078aebc7c0fcf046ff29406"; hash = "sha256-CxcpvW4QxkUDB2zMz7NB6Rt9jXjJeGVExfrYbn9ef5I="; rev = "b1281b8b08d973a7064f864f47eeb30f3e2596e9"; hash = "sha256-ZWfTZ9UxnIpkoRnTmB5e3H/LY5a2HixkCCIMcnpc+Gw="; }; src_composable_kernel_flash-attention = fetchFromGitHub { owner = "ROCm"; Loading @@ -63,8 +75,8 @@ rec { src_cpp-httplib = fetchFromGitHub { owner = "yhirose"; repo = "cpp-httplib"; rev = "3af7f2c16147f3fbc6e4d717032daf505dc1652c"; hash = "sha256-t/ddZjKelnXQdXQvZgv9pQcJt3M2rwgbtTQNW/T6Gpk="; rev = "89c932f313c6437c38f2982869beacc89c2f2246"; hash = "sha256-+VPebnFMGNyChM20q4Z+kVOyI/qDLQjRsaGS0vo8kDM="; }; src_cpr = fetchFromGitHub { owner = "libcpr"; Loading @@ -87,20 +99,20 @@ rec { src_cudnn-frontend = fetchFromGitHub { owner = "NVIDIA"; repo = "cudnn-frontend"; rev = "666996fe3960f27170d1527e5579ba24c8d3380a"; hash = "sha256-/ae5dNjqkn5dGciGSO+Pn9pzJg+hHWqqQCPcqggWezo="; rev = "f937055efc6d414d11f4c6577e3977fe74f35fb6"; hash = "sha256-LiTajW2hrDth8wEC4Vp2lZO+CeMqK+tEKPLok7gXB/s="; }; src_cutlass = fetchFromGitHub { owner = "NVIDIA"; repo = "cutlass"; rev = "ad7b2f5e84fcfa124cb02b91d5bd26d238c0459e"; hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI="; rev = "e51efbfe18fe4f4cbb66ab814c55bf4aa0185491"; hash = "sha256-ZY+6Tg/CC6fqvU764k6QNudYDpY+s8OQklG+1aXQuns="; }; src_cutlass_fbgemm = fetchFromGitHub { owner = "jwfromm"; repo = "cutlass"; rev = "3ed8d2ec4ba35ef5d9d8353826209b6f868f63d3"; hash = "sha256-NntohGvqs6fbWusi2Qv5uzCJhMAfBv8qYoFi38D+mzk="; rev = "311f3c8e51dc0eb56310cfc6980bf63d0fbd7917"; hash = "sha256-JSr48FkrYE9mvm1+ikrqUxrYuV4Bok2EOdcyeTsMdiA="; }; src_cutlass_flash-attention = fetchFromGitHub { owner = "NVIDIA"; Loading @@ -123,8 +135,8 @@ rec { src_fbgemm = fetchFromGitHub { owner = "pytorch"; repo = "fbgemm"; rev = "157e88b750c452bef2ab4653fe9d1eeb151ce4c3"; hash = "sha256-Ka8/4gBsbtKNhKM/cWg1NmlKjVeBZvS+yS9SQQxb34A="; rev = "4b39c551efe15e6bbade20565b0ceb2d8ce3352d"; hash = "sha256-a7oNR2RMQWiaX9jLAy5Y4aniByqj7f9g65snOjPPwK0="; }; src_fbjni = fetchFromGitHub { owner = "facebookincubator"; Loading Loading @@ -201,8 +213,8 @@ rec { src_gloo = fetchFromGitHub { owner = "pytorch"; repo = "gloo"; rev = "c7b7b022c124d9643957d9bd55f57ac59fce8fa2"; hash = "sha256-pZ08gs6wQTZNVDX9uuaQZvw5JKCps8EALegNF0UVV3c="; rev = "54cbae0d3a67fa890b4c3d9ee162b7860315e341"; hash = "sha256-4g/AffVyU7iEHJI0KRkvJqeHSBMRB89V4/cfVf3yPf4="; }; src_googletest = fetchFromGitHub { owner = "google"; Loading @@ -216,12 +228,6 @@ rec { rev = "58d77fa8070e8cec2dc1ed015d66b454c8d78850"; hash = "sha256-W+OxRTVtemt2esw4P7IyGWXOonUN5ZuscjvzqkYvZbM="; }; src_googletest_fbgemm = fetchFromGitHub { owner = "google"; repo = "googletest"; rev = "f8d7d77c06936315286eb55f8de22cd23c188571"; hash = "sha256-t0RchAHTJbuI5YW4uyBPykTvcjy90JW9AOPNjIhwh6U="; }; src_googletest_kineto = fetchFromGitHub { owner = "google"; repo = "googletest"; Loading Loading @@ -261,8 +267,8 @@ rec { src_hipify_torch = fetchFromGitHub { owner = "ROCmSoftwarePlatform"; repo = "hipify_torch"; rev = "a4337c69fe0e2552a7b7b0669178926beeed828c"; hash = "sha256-B0+tDjSlZ9C5IAAgteRIgwaJNnptpp1jOP3hTF5AdOw="; rev = "63b6a7b541fa7f08f8475ca7d74054db36ff2691"; hash = "sha256-TH9fyprP21sRsxGs4VrahhFSIXDhnLvV09c+ZCE27u0="; }; src_ideep = fetchFromGitHub { owner = "intel"; Loading Loading @@ -321,14 +327,14 @@ rec { src_libuv = fetchFromGitHub { owner = "libuv"; repo = "libuv"; rev = "1dff88e5161cba5c59276d2070d2e304e4dcb242"; hash = "sha256-i6AYD1Ony0L2+3yWK6bxOfwoZEvd9qCg33QSqA7bRXI="; rev = "5152db2cbfeb5582e9c27c5ea1dba2cd9e10759b"; hash = "sha256-ayTk3qkeeAjrGj5ab7wF7vpWI8XWS1EeKKUqzaD/LY0="; }; src_mimalloc = fetchFromGitHub { owner = "microsoft"; repo = "mimalloc"; rev = "94036de6fe20bfd8a73d4a6d142fcf532ea604d9"; hash = "sha256-B0gngv16WFLBtrtG5NqA2m5e95bYVcQraeITcOX9A74="; rev = "fbd8b99c2b828428947d70fdc046bb55609be93e"; hash = "sha256-+8xZT+mVEqlqabQc+1buVH/X6FZxvCd0rWMyjPu9i4o="; }; src_mkl-dnn = fetchFromGitHub { owner = "intel"; Loading Loading @@ -415,6 +421,12 @@ rec { hash = "sha256-R4YmNzWEELSkAws/ejmNVxqXDTJwcqjLU/o/HvgRn2E="; }; src_pybind11 = fetchFromGitHub { owner = "pybind"; repo = "pybind11"; rev = "f5fbe867d2d26e4a0a9177a51f6e568868ad3dc8"; hash = "sha256-ZiwNGsE1FOkhnWv/1ib1akhQ4FZvrXRCDnnBZoPp6r4="; }; src_pybind11_onnx = fetchFromGitHub { owner = "pybind"; repo = "pybind11"; rev = "a2e59f0e7065404b44dfe92a28aca47ba1378dc4"; Loading @@ -429,8 +441,8 @@ rec { src_pytorch = fetchFromGitHub { owner = "pytorch"; repo = "pytorch"; rev = "v2.8.0"; hash = "sha256-tFEpcgj0HiJcyBiZMtIrBrnmiCJApfTC1BgOXEGvqCo="; rev = "v2.9.0"; hash = "sha256-0NdREKn9h3FtHKVe1Z3QtSOVdEcfgLlWXG/OiI+QrwA="; }; src_sleef = fetchFromGitHub { owner = "shibatch"; Loading @@ -441,8 +453,8 @@ rec { src_tensorpipe = fetchFromGitHub { owner = "pytorch"; repo = "tensorpipe"; rev = "52791a2fd214b2a9dc5759d36725909c1daa7f2e"; hash = "sha256-i+CtjNFPDUzFCPxP0//jMLJDrQoorg0On9NfoVaMUxI="; rev = "af0118d13e52f5a08841464a768e01a0bf3e3075"; hash = "sha256-X2YfYfDKxG0i2K8Uf7gpSb+LU2y0d4VBZM3lTa/ff1w="; }; src_vcpkg = fetchFromGitHub { owner = "Microsoft"; Loading @@ -462,6 +474,11 @@ rec { rev = "51a0103656eff6fc9bfd39a4597923c4b542c883"; hash = "sha256-nhowllqv/hBs7xHdTwbWtiKJ1mvAYsVIyIZ35ZGsmkg="; }; src_aiter_recursive = runCommand "aiter" { } '' cp -r ${src_aiter} $out chmod u+w $out/3rdparty/composable_kernel cp -r ${src_composable_kernel_aiter_recursive}/* $out/3rdparty/composable_kernel ''; src_asmjit_recursive = src_asmjit; src_benchmark_recursive = src_benchmark; src_benchmark_opentelemetry-cpp_recursive = src_benchmark_opentelemetry-cpp; Loading @@ -469,6 +486,7 @@ rec { src_civetweb_recursive = src_civetweb; src_clang-cindex-python3_recursive = src_clang-cindex-python3; src_composable_kernel_recursive = src_composable_kernel; src_composable_kernel_aiter_recursive = src_composable_kernel_aiter; src_composable_kernel_fbgemm_recursive = src_composable_kernel_fbgemm; src_composable_kernel_flash-attention_recursive = src_composable_kernel_flash-attention; src_cpp-httplib_recursive = src_cpp-httplib; Loading Loading @@ -510,7 +528,7 @@ rec { chmod u+w $out/external/cutlass cp -r ${src_cutlass_fbgemm_recursive}/* $out/external/cutlass chmod u+w $out/external/googletest cp -r ${src_googletest_fbgemm_recursive}/* $out/external/googletest cp -r ${src_googletest_recursive}/* $out/external/googletest chmod u+w $out/external/hipify_torch cp -r ${src_hipify_torch_recursive}/* $out/external/hipify_torch chmod u+w $out/external/json Loading Loading @@ -541,7 +559,6 @@ rec { src_gloo_recursive = src_gloo; src_googletest_recursive = src_googletest; src_googletest_dynolog_recursive = src_googletest_dynolog; src_googletest_fbgemm_recursive = src_googletest_fbgemm; src_googletest_kineto_recursive = src_googletest_kineto; src_googletest_opentelemetry-cpp_recursive = src_googletest_opentelemetry-cpp; src_googletest_prometheus-cpp_recursive = src_googletest_prometheus-cpp; Loading Loading @@ -578,7 +595,7 @@ rec { src_onnx_recursive = runCommand "onnx" { } '' cp -r ${src_onnx} $out chmod u+w $out/third_party/pybind11 cp -r ${src_pybind11_recursive}/* $out/third_party/pybind11 cp -r ${src_pybind11_onnx_recursive}/* $out/third_party/pybind11 ''; src_opentelemetry-cpp_recursive = runCommand "opentelemetry-cpp" { } '' cp -r ${src_opentelemetry-cpp} $out Loading Loading @@ -621,6 +638,7 @@ rec { src_psimd_recursive = src_psimd; src_pthreadpool_recursive = src_pthreadpool; src_pybind11_recursive = src_pybind11; src_pybind11_onnx_recursive = src_pybind11_onnx; src_pybind11_tensorpipe_recursive = runCommand "pybind11_tensorpipe" { } '' cp -r ${src_pybind11_tensorpipe} $out chmod u+w $out/tools/clang Loading @@ -630,6 +648,8 @@ rec { cp -r ${src_pytorch} $out chmod u+w $out/android/libs/fbjni cp -r ${src_fbjni_recursive}/* $out/android/libs/fbjni chmod u+w $out/third_party/aiter cp -r ${src_aiter_recursive}/* $out/third_party/aiter chmod u+w $out/third_party/benchmark cp -r ${src_benchmark_recursive}/* $out/third_party/benchmark chmod u+w $out/third_party/composable_kernel Loading
pkgs/development/python-modules/torchaudio/default.nix +2 −2 Original line number Diff line number Diff line Loading @@ -77,7 +77,7 @@ let in buildPythonPackage rec { pname = "torchaudio"; version = "2.8.0"; version = "2.9.0"; pyproject = true; stdenv = torch.stdenv; Loading @@ -86,7 +86,7 @@ buildPythonPackage rec { owner = "pytorch"; repo = "audio"; tag = "v${version}"; hash = "sha256-SPa6ZWA2AWawfL4Z4mb1nddGaAsGEl/0dwweBpex2Wo="; hash = "sha256-oZTe0LWqOJ0NUxmmUKZN3GhMgloOMCYMicbYoaW2pTw="; }; patches = [ Loading