Loading pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch 0 → 100644 +29 −0 Original line number Diff line number Diff line From c5d4087519eae6f41c80bbd8ffbcc9390db44c7f Mon Sep 17 00:00:00 2001 From: SomeoneSerge <else+aalto@someonex.net> Date: Thu, 10 Oct 2024 19:19:18 +0000 Subject: [PATCH] cmake.py: propagate cmakeFlags from environment --- tools/setup_helpers/cmake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 4b605fe5975..ea1d6a1ef46 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -332,6 +332,12 @@ class CMake: file=sys.stderr, ) print(e, file=sys.stderr) + + # Nixpkgs compat: + if "cmakeFlags" in os.environ: + import shlex + args.extend(shlex.split(os.environ["cmakeFlags"])) + # According to the CMake manual, we should pass the arguments first, # and put the directory as the last element. Otherwise, these flags # may not be passed correctly. -- 2.46.0 pkgs/development/python-modules/torch/default.nix +46 −10 Original line number Diff line number Diff line Loading @@ -35,10 +35,8 @@ removeReferencesTo, # Build inputs darwin, numactl, Accelerate, CoreServices, libobjc, # Propagated build inputs astunparse, Loading @@ -56,6 +54,17 @@ tritonSupport ? (!stdenv.hostPlatform.isDarwin), triton, # TODO: 1. callPackage needs to learn to distinguish between the task # of "asking for an attribute from the parent scope" and # the task of "exposing a formal parameter in .override". # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.) # as they routinely end up consuming the wrong arguments\ # (dependencies without cuda support). # Instead we should rely on overlays and nixpkgsFun. # (@SomeoneSerge) _tritonEffective ? if cudaSupport then triton-cuda else triton, triton-cuda, # Unit tests hypothesis, psutil, Loading Loading @@ -95,6 +104,8 @@ let ; inherit (cudaPackages) cudaFlags cudnn nccl; triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence"; rocmPackages = rocmPackages_5; setBool = v: if v then "1" else "0"; Loading Loading @@ -240,6 +251,7 @@ buildPythonPackage rec { # Allow setting PYTHON_LIB_REL_PATH with an environment variable. # https://github.com/pytorch/pytorch/pull/128419 ./passthrough-python-lib-rel-path.patch ./0001-cmake.py-propagate-cmakeFlags-from-environment.patch ] ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ] ++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [ Loading @@ -257,7 +269,18 @@ buildPythonPackage rec { ]; postPatch = lib.optionalString rocmSupport '' '' substituteInPlace cmake/public/cuda.cmake \ --replace-fail \ 'message(FATAL_ERROR "Found two conflicting CUDA' \ 'message(WARNING "Found two conflicting CUDA' \ --replace-warn \ "set(CUDAToolkit_ROOT" \ "# Upstream: set(CUDAToolkit_ROOT" substituteInPlace third_party/gloo/cmake/Cuda.cmake \ --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit" '' + lib.optionalString rocmSupport '' # https://github.com/facebookincubator/gloo/pull/297 substituteInPlace third_party/gloo/cmake/Hipify.cmake \ --replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}" Loading Loading @@ -351,6 +374,17 @@ buildPythonPackage rec { # NB technical debt: building without NNPACK as workaround for missing `six` USE_NNPACK = 0; cmakeFlags = [ # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true) (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion) ] ++ lib.optionals cudaSupport [ # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363 (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaVersion) ]; preBuild = '' export MAX_JOBS=$NIX_BUILD_CORES ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only Loading Loading @@ -495,11 +529,11 @@ buildPythonPackage rec { ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ] ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ] ++ lib.optionals stdenv.hostPlatform.isDarwin [ Accelerate CoreServices libobjc darwin.apple_sdk.frameworks.Accelerate darwin.apple_sdk.frameworks.CoreServices darwin.libobjc ] ++ lib.optionals tritonSupport [ triton ] ++ lib.optionals tritonSupport [ _tritonEffective ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; Loading Loading @@ -527,7 +561,7 @@ buildPythonPackage rec { # torch/csrc requires `pybind11` at runtime pybind11 ] ++ lib.optionals tritonSupport [ triton ]; ] ++ lib.optionals tritonSupport [ _tritonEffective ]; propagatedCxxBuildInputs = [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; Loading Loading @@ -662,7 +696,9 @@ buildPythonPackage rec { thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds platforms = with lib.platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin; platforms = lib.platforms.linux ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin; broken = builtins.any trivial.id (builtins.attrValues brokenConditions); }; } pkgs/development/python-modules/triton/default.nix +85 −18 Original line number Diff line number Diff line Loading @@ -15,7 +15,8 @@ ninja, pybind11, python, runCommand, pytestCheckHook, stdenv, substituteAll, setuptools, torchWithRocm, Loading @@ -23,6 +24,7 @@ cudaSupport ? config.cudaSupport, rocmSupport ? config.rocmSupport, rocmPackages, triton, }: buildPythonPackage { Loading Loading @@ -164,16 +166,10 @@ buildPythonPackage { # CMake is run by setup.py instead dontUseCmakeConfigure = true; nativeCheckInputs = [ cmake # Requires torch (circular dependency) and GPU access: pytestCheckHook ]; nativeCheckInputs = [ cmake ]; preCheck = '' # build/temp* refers to build_ext.build_temp (looked up in the build logs) (cd ./build/temp* ; ctest) # For pytestCheckHook cd test/unit ''; pythonImportsCheck = [ Loading @@ -181,20 +177,91 @@ buildPythonPackage { "triton.language" ]; # Ultimately, torch is our test suite: passthru.gpuCheck = stdenv.mkDerivation { pname = "triton-pytest"; inherit (triton) version src; requiredSystemFeatures = [ "cuda" ]; nativeBuildInputs = [ (python.withPackages (ps: [ ps.scipy ps.torchWithCuda ps.triton-cuda ])) ]; dontBuild = true; nativeCheckInputs = [ pytestCheckHook ]; doCheck = true; preCheck = '' cd python/test/unit export HOME=$TMPDIR ''; checkPhase = "pytestCheckPhase"; installPhase = "touch $out"; }; passthru.tests = { # Ultimately, torch is our test suite: inherit torchWithRocm; # Implemented as alternative to pythonImportsCheck, in case if circular dependency on torch occurs again, # and pythonImportsCheck is commented back. import-triton = runCommand "import-triton" { nativeBuildInputs = [ (python.withPackages (ps: [ ps.triton ])) ]; } # Test as `nix run -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda` # or, using `programs.nix-required-mounts`, as `nix build -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda.gpuCheck` axpy-cuda = cudaPackages.writeGpuTestPython { libraries = ps: [ ps.triton ps.torch-no-triton ]; } '' python << \EOF # Adopted from Philippe Tillet https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html import triton import triton.language EOF touch "$out" import triton.language as tl import torch import os @triton.jit def axpy_kernel(n, a: tl.constexpr, x_ptr, y_ptr, out, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis=0) block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < n x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) output = a * x + y tl.store(out + offsets, output, mask=mask) def axpy(a, x, y): output = torch.empty_like(x) assert x.is_cuda and y.is_cuda and output.is_cuda n_elements = output.numel() def grid(meta): return (triton.cdiv(n_elements, meta['BLOCK_SIZE']), ) axpy_kernel[grid](n_elements, a, x, y, output, BLOCK_SIZE=1024) return output if __name__ == "__main__": if os.environ.get("HOME", None) == "/homeless-shelter": os.environ["HOME"] = os.environ.get("TMPDIR", "/tmp") if "CC" not in os.environ: os.environ["CC"] = "${lib.getExe' cudaPackages.backendStdenv.cc "cc"}" torch.manual_seed(0) size = 12345 x = torch.rand(size, device='cuda') y = torch.rand(size, device='cuda') output_torch = 3.14 * x + y output_triton = axpy(3.14, x, y) assert output_torch.sub(output_triton).abs().max().item() < 1e-6 print("Triton axpy: OK") ''; }; Loading pkgs/top-level/python-packages.nix +4 −4 Original line number Diff line number Diff line Loading @@ -15717,10 +15717,10 @@ self: super: with self; { toposort = callPackage ../development/python-modules/toposort { }; torch = callPackage ../development/python-modules/torch { inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices; inherit (pkgs.darwin) libobjc; }; torch = callPackage ../development/python-modules/torch { }; # Required to test triton torch-no-triton = self.torch.override { tritonSupport = false; }; torch-audiomentations = callPackage ../development/python-modules/torch-audiomentations { }; Loading
pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch 0 → 100644 +29 −0 Original line number Diff line number Diff line From c5d4087519eae6f41c80bbd8ffbcc9390db44c7f Mon Sep 17 00:00:00 2001 From: SomeoneSerge <else+aalto@someonex.net> Date: Thu, 10 Oct 2024 19:19:18 +0000 Subject: [PATCH] cmake.py: propagate cmakeFlags from environment --- tools/setup_helpers/cmake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 4b605fe5975..ea1d6a1ef46 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -332,6 +332,12 @@ class CMake: file=sys.stderr, ) print(e, file=sys.stderr) + + # Nixpkgs compat: + if "cmakeFlags" in os.environ: + import shlex + args.extend(shlex.split(os.environ["cmakeFlags"])) + # According to the CMake manual, we should pass the arguments first, # and put the directory as the last element. Otherwise, these flags # may not be passed correctly. -- 2.46.0
pkgs/development/python-modules/torch/default.nix +46 −10 Original line number Diff line number Diff line Loading @@ -35,10 +35,8 @@ removeReferencesTo, # Build inputs darwin, numactl, Accelerate, CoreServices, libobjc, # Propagated build inputs astunparse, Loading @@ -56,6 +54,17 @@ tritonSupport ? (!stdenv.hostPlatform.isDarwin), triton, # TODO: 1. callPackage needs to learn to distinguish between the task # of "asking for an attribute from the parent scope" and # the task of "exposing a formal parameter in .override". # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.) # as they routinely end up consuming the wrong arguments\ # (dependencies without cuda support). # Instead we should rely on overlays and nixpkgsFun. # (@SomeoneSerge) _tritonEffective ? if cudaSupport then triton-cuda else triton, triton-cuda, # Unit tests hypothesis, psutil, Loading Loading @@ -95,6 +104,8 @@ let ; inherit (cudaPackages) cudaFlags cudnn nccl; triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence"; rocmPackages = rocmPackages_5; setBool = v: if v then "1" else "0"; Loading Loading @@ -240,6 +251,7 @@ buildPythonPackage rec { # Allow setting PYTHON_LIB_REL_PATH with an environment variable. # https://github.com/pytorch/pytorch/pull/128419 ./passthrough-python-lib-rel-path.patch ./0001-cmake.py-propagate-cmakeFlags-from-environment.patch ] ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ] ++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [ Loading @@ -257,7 +269,18 @@ buildPythonPackage rec { ]; postPatch = lib.optionalString rocmSupport '' '' substituteInPlace cmake/public/cuda.cmake \ --replace-fail \ 'message(FATAL_ERROR "Found two conflicting CUDA' \ 'message(WARNING "Found two conflicting CUDA' \ --replace-warn \ "set(CUDAToolkit_ROOT" \ "# Upstream: set(CUDAToolkit_ROOT" substituteInPlace third_party/gloo/cmake/Cuda.cmake \ --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit" '' + lib.optionalString rocmSupport '' # https://github.com/facebookincubator/gloo/pull/297 substituteInPlace third_party/gloo/cmake/Hipify.cmake \ --replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}" Loading Loading @@ -351,6 +374,17 @@ buildPythonPackage rec { # NB technical debt: building without NNPACK as workaround for missing `six` USE_NNPACK = 0; cmakeFlags = [ # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true) (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion) ] ++ lib.optionals cudaSupport [ # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363 (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaVersion) ]; preBuild = '' export MAX_JOBS=$NIX_BUILD_CORES ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only Loading Loading @@ -495,11 +529,11 @@ buildPythonPackage rec { ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ] ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ] ++ lib.optionals stdenv.hostPlatform.isDarwin [ Accelerate CoreServices libobjc darwin.apple_sdk.frameworks.Accelerate darwin.apple_sdk.frameworks.CoreServices darwin.libobjc ] ++ lib.optionals tritonSupport [ triton ] ++ lib.optionals tritonSupport [ _tritonEffective ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; Loading Loading @@ -527,7 +561,7 @@ buildPythonPackage rec { # torch/csrc requires `pybind11` at runtime pybind11 ] ++ lib.optionals tritonSupport [ triton ]; ] ++ lib.optionals tritonSupport [ _tritonEffective ]; propagatedCxxBuildInputs = [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; Loading Loading @@ -662,7 +696,9 @@ buildPythonPackage rec { thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds platforms = with lib.platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin; platforms = lib.platforms.linux ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin; broken = builtins.any trivial.id (builtins.attrValues brokenConditions); }; }
pkgs/development/python-modules/triton/default.nix +85 −18 Original line number Diff line number Diff line Loading @@ -15,7 +15,8 @@ ninja, pybind11, python, runCommand, pytestCheckHook, stdenv, substituteAll, setuptools, torchWithRocm, Loading @@ -23,6 +24,7 @@ cudaSupport ? config.cudaSupport, rocmSupport ? config.rocmSupport, rocmPackages, triton, }: buildPythonPackage { Loading Loading @@ -164,16 +166,10 @@ buildPythonPackage { # CMake is run by setup.py instead dontUseCmakeConfigure = true; nativeCheckInputs = [ cmake # Requires torch (circular dependency) and GPU access: pytestCheckHook ]; nativeCheckInputs = [ cmake ]; preCheck = '' # build/temp* refers to build_ext.build_temp (looked up in the build logs) (cd ./build/temp* ; ctest) # For pytestCheckHook cd test/unit ''; pythonImportsCheck = [ Loading @@ -181,20 +177,91 @@ buildPythonPackage { "triton.language" ]; # Ultimately, torch is our test suite: passthru.gpuCheck = stdenv.mkDerivation { pname = "triton-pytest"; inherit (triton) version src; requiredSystemFeatures = [ "cuda" ]; nativeBuildInputs = [ (python.withPackages (ps: [ ps.scipy ps.torchWithCuda ps.triton-cuda ])) ]; dontBuild = true; nativeCheckInputs = [ pytestCheckHook ]; doCheck = true; preCheck = '' cd python/test/unit export HOME=$TMPDIR ''; checkPhase = "pytestCheckPhase"; installPhase = "touch $out"; }; passthru.tests = { # Ultimately, torch is our test suite: inherit torchWithRocm; # Implemented as alternative to pythonImportsCheck, in case if circular dependency on torch occurs again, # and pythonImportsCheck is commented back. import-triton = runCommand "import-triton" { nativeBuildInputs = [ (python.withPackages (ps: [ ps.triton ])) ]; } # Test as `nix run -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda` # or, using `programs.nix-required-mounts`, as `nix build -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda.gpuCheck` axpy-cuda = cudaPackages.writeGpuTestPython { libraries = ps: [ ps.triton ps.torch-no-triton ]; } '' python << \EOF # Adopted from Philippe Tillet https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html import triton import triton.language EOF touch "$out" import triton.language as tl import torch import os @triton.jit def axpy_kernel(n, a: tl.constexpr, x_ptr, y_ptr, out, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis=0) block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < n x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) output = a * x + y tl.store(out + offsets, output, mask=mask) def axpy(a, x, y): output = torch.empty_like(x) assert x.is_cuda and y.is_cuda and output.is_cuda n_elements = output.numel() def grid(meta): return (triton.cdiv(n_elements, meta['BLOCK_SIZE']), ) axpy_kernel[grid](n_elements, a, x, y, output, BLOCK_SIZE=1024) return output if __name__ == "__main__": if os.environ.get("HOME", None) == "/homeless-shelter": os.environ["HOME"] = os.environ.get("TMPDIR", "/tmp") if "CC" not in os.environ: os.environ["CC"] = "${lib.getExe' cudaPackages.backendStdenv.cc "cc"}" torch.manual_seed(0) size = 12345 x = torch.rand(size, device='cuda') y = torch.rand(size, device='cuda') output_torch = 3.14 * x + y output_triton = axpy(3.14, x, y) assert output_torch.sub(output_triton).abs().max().item() < 1e-6 print("Triton axpy: OK") ''; }; Loading
pkgs/top-level/python-packages.nix +4 −4 Original line number Diff line number Diff line Loading @@ -15717,10 +15717,10 @@ self: super: with self; { toposort = callPackage ../development/python-modules/toposort { }; torch = callPackage ../development/python-modules/torch { inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices; inherit (pkgs.darwin) libobjc; }; torch = callPackage ../development/python-modules/torch { }; # Required to test triton torch-no-triton = self.torch.override { tritonSupport = false; }; torch-audiomentations = callPackage ../development/python-modules/torch-audiomentations { };