pkgs/development/python-modules/torch/source/default.nix  (+5 −0)

@@ -306,6 +306,11 @@ buildPythonPackage rec {
       url = "https://github.com/pytorch/pytorch/commit/231c72240d80091f099c95e326d3600cba866eee.patch";
       hash = "sha256-BBCjxzz2TUkx4nXRyRILA82kMwyb/4+C3eOtYqf5dhk=";
     })
+
+    # Fixes GCC-14 compatibility on ARM
+    # Adapted from https://github.com/pytorch/pytorch/pull/157867
+    # TODO: remove at the next release
+    ./gcc-14-arm-compat.path
   ]
   ++ lib.optionals cudaSupport [
     ./fix-cmake-cuda-toolkit.patch
pkgs/development/python-modules/torch/source/gcc-14-arm-compat.path  (new file, +49 −0)

diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
index 7f05c2ad166..1632b595c4c 100644
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
 };
 
-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
-    const Vectorized<c10::BFloat16>& a) {
+#if defined(__GNUC__) && __GNUC__ == 14
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline std::tuple<Vectorized<float>, Vectorized<float>>
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
   static_assert(
       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index 52d5383e60f..00c9f4eb253 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -26,6 +26,10 @@ namespace at::native {
 
 namespace {
 
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
   if (at::isReducedFloatingType(input.scalar_type())) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
index 8ef0741e77a..8c94decfff0 100644
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
 
 /* note: due to write issues, this one cannot be parallelized as well as
  * unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 void unfolded2d_acc_kernel(
     ScalarType dtype,
     void *finput_data,
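
For context on the workaround itself, below is a minimal, self-contained sketch of the pattern the patch applies: a per-function __attribute__((optimize("no-tree-vectorize"))) gated on __GNUC__ == 14, so only the function that trips the GCC 14 vectorizer ICE loses auto-vectorization while the rest of the translation unit compiles normally. The function name and loop (scale_inplace) are hypothetical stand-ins for illustration, not code from the patch.

// sketch.cpp — illustrative only; mirrors the patch's pattern, not its code.
#include <cstddef>
#include <vector>

#if defined(__GNUC__) && __GNUC__ == 14
// Only GCC 14 hits the ICE, so only GCC 14 loses auto-vectorization here;
// other compilers and versions still vectorize this function as usual.
__attribute__((optimize("no-tree-vectorize")))
#endif
void scale_inplace(std::vector<float>& data, float factor) {
  // A trivial loop standing in for the affected ATen kernels.
  for (std::size_t i = 0; i < data.size(); ++i) {
    data[i] *= factor;
  }
}

int main() {
  std::vector<float> v(1024, 1.0f);
  scale_inplace(v, 2.0f);
  return v[0] == 2.0f ? 0 : 1;
}

Compared with lowering the optimization level for whole files, this keeps vectorized code everywhere except the problematic functions; per the TODO in default.nix, the patch is meant to be dropped once the upstream fix adapted from pytorch/pytorch#157867 reaches a release.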