pkgs/development/python-modules/torch/source/default.nix  (+5 −0)

@@ -306,6 +306,11 @@ buildPythonPackage rec {
       url = "https://github.com/pytorch/pytorch/commit/231c72240d80091f099c95e326d3600cba866eee.patch";
       hash = "sha256-BBCjxzz2TUkx4nXRyRILA82kMwyb/4+C3eOtYqf5dhk=";
     })
+
+    # Fixes GCC-14 compatibility on ARM
+    # Adapted from https://github.com/pytorch/pytorch/pull/157867
+    # TODO: remove at the next release
+    ./gcc-14-arm-compat.path
   ]
   ++ lib.optionals cudaSupport [
     ./fix-cmake-cuda-toolkit.patch
pkgs/development/python-modules/torch/source/gcc-14-arm-compat.path  (new file, +49 −0)

diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
index 7f05c2ad166..1632b595c4c 100644
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
 };
 
-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
-    const Vectorized<c10::BFloat16>& a) {
+#if defined(__GNUC__) && __GNUC__ == 14
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline std::tuple<Vectorized<float>, Vectorized<float>>
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
   static_assert(
       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index 52d5383e60f..00c9f4eb253 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -26,6 +26,10 @@ namespace at::native {
 
 namespace {
 
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
   if (at::isReducedFloatingType(input.scalar_type())) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
index 8ef0741e77a..8c94decfff0 100644
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
 
 /* note: due to write issues, this one cannot be parallelized as well as
  * unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 void unfolded2d_acc_kernel(
     ScalarType dtype,
     void *finput_data,
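
For context on the workaround itself, below is a minimal, self-contained sketch of the pattern the patch applies: a per-function __attribute__((optimize("no-tree-vectorize"))) gated on __GNUC__ == 14, so only the function that trips the GCC 14 vectorizer ICE loses auto-vectorization while the rest of the translation unit compiles normally. The function name and loop (scale_inplace) are hypothetical stand-ins for illustration, not code from the patch.

// sketch.cpp — illustrative only; mirrors the patch's pattern, not its code.
#include <cstddef>
#include <vector>

#if defined(__GNUC__) && __GNUC__ == 14
// Only GCC 14 hits the ICE, so only GCC 14 loses auto-vectorization here;
// other compilers and versions still vectorize this function as usual.
__attribute__((optimize("no-tree-vectorize")))
#endif
void scale_inplace(std::vector<float>& data, float factor) {
  // A trivial loop standing in for the affected ATen kernels.
  for (std::size_t i = 0; i < data.size(); ++i) {
    data[i] *= factor;
  }
}

int main() {
  std::vector<float> v(1024, 1.0f);
  scale_inplace(v, 2.0f);
  return v[0] == 2.0f ? 0 : 1;
}

Compared with lowering the optimization level for whole files, this keeps vectorized code everywhere except the problematic functions; per the TODO in default.nix, the patch is meant to be dropped once the upstream fix adapted from pytorch/pytorch#157867 reaches a release.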