pkgs/development/python-modules/bitsandbytes/default.nix +5 −5

@@ -11,7 +11,7 @@ let
   pname = "bitsandbytes";
-  version = "0.45.1";
+  version = "0.46.0";

   inherit (torch) cudaPackages cudaSupport;
   inherit (cudaPackages) cudaMajorMinorVersion;
@@ -54,10 +54,10 @@ buildPythonPackage {
   pyproject = true;

   src = fetchFromGitHub {
-    owner = "TimDettmers";
+    owner = "bitsandbytes-foundation";
     repo = "bitsandbytes";
     tag = version;
-    hash = "sha256-MZ+3mUXaAhRb+rBtE+eQqT3XdtFxlWJc/CmTEwQkKSA=";
+    hash = "sha256-q1ltNYO5Ex6F2bfCcsekdsWjzXoal7g4n/LIHVGuj+k=";
   };

   # By default, which library is loaded depends on the result of `torch.cuda.is_available()`.
@@ -112,8 +112,8 @@ buildPythonPackage {
   meta = {
     description = "8-bit CUDA functions for PyTorch";
-    homepage = "https://github.com/TimDettmers/bitsandbytes";
-    changelog = "https://github.com/TimDettmers/bitsandbytes/releases/tag/${version}";
+    homepage = "https://github.com/bitsandbytes-foundation/bitsandbytes";
+    changelog = "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/tag/${version}";
     license = lib.licenses.mit;
     maintainers = with lib.maintainers; [ bcdarwin ];
   };

pkgs/development/python-modules/vllm/0005-drop-intel-reqs.patch (new file) +12 −0

diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 121330158..d41918883 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -20,7 +20,3 @@ datasets # for benchmark scripts

 # cpu cannot use triton 3.3.0
 triton==3.2.0; platform_machine == "x86_64"
-
-# Intel Extension for PyTorch, only for x86_64 CPUs
-intel-openmp==2024.2.1; platform_machine == "x86_64"
-intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"

pkgs/development/python-modules/vllm/default.nix +59 −23

@@ -3,8 +3,8 @@
   stdenv,
   python,
   buildPythonPackage,
-  pythonRelaxDepsHook,
   fetchFromGitHub,
+  fetchpatch,
   symlinkJoin,
   autoAddDriverRunpath,
@@ -15,7 +15,6 @@
   packaging,
   setuptools,
   setuptools-scm,
-  wheel,

   # dependencies
   which,
@@ -63,6 +62,14 @@
   python-json-logger,
   python-multipart,
   llvmPackages,
+  opentelemetry-sdk,
+  opentelemetry-api,
+  opentelemetry-exporter-otlp,
+  bitsandbytes,
+  flashinfer,
+
+  # internal dependency - for overriding in overlays
+  vllm-flash-attn ? null,

   cudaSupport ? torch.cudaSupport,
   cudaPackages ? { },
@@ -88,8 +95,8 @@ let
   cutlass = fetchFromGitHub {
     owner = "NVIDIA";
     repo = "cutlass";
-    tag = "v3.8.0";
-    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
+    tag = "v3.9.2";
+    hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI=";
   };

   flashmla = stdenv.mkDerivation {
@@ -119,36 +126,41 @@
     '';
   };

-  vllm-flash-attn = stdenv.mkDerivation {
+  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
     pname = "vllm-flash-attn";
     # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
-    version = "2.7.2.post1";
+    version = "2.7.4.post1";
     # grep for GIT_TAG in the following file
     # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
     src = fetchFromGitHub {
       owner = "vllm-project";
       repo = "flash-attention";
-      rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
-      hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
+      rev = "8798f27777fb57f447070301bf33a9f9c607f491";
+      hash = "sha256-UTUvATGN1NU/Bc8qo078q6bEgILLmlrjL7Yk2iAJhg4=";
     };

     dontConfigure = true;

-    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
-    buildPhase = ''
+    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
+    buildPhase = ''
       rm -rf csrc/cutlass
       ln -sf ${cutlass} csrc/cutlass
-    '';
+    ''
+    + lib.optionalString (rocmSupport) ''
+      rm -rf csrc/composable_kernel;
+      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
+    '';

     installPhase = ''
       cp -rva . $out
     '';
-  };
+  }) vllm-flash-attn;

   cpuSupport = !cudaSupport && !rocmSupport;

-  # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
+  # https://github.com/pytorch/pytorch/blob/v2.7.0/torch/utils/cpp_extension.py#L2343-L2345
   supportedTorchCudaCapabilities =
     let
       real = [
@@ -170,6 +182,11 @@
         "9.0"
         "9.0a"
+        "10.0"
+        "10.0a"
+        "10.1"
+        "10.1a"
+        "12.0"
+        "12.0a"
       ];
       ptx = lists.map (x: "${x}+PTX") real;
     in
@@ -229,7 +246,7 @@ in
 buildPythonPackage rec {
   pname = "vllm";
-  version = "0.8.3";
+  version = "0.9.0.1";
   pyproject = true;

   stdenv = torch.stdenv;
@@ -238,13 +255,19 @@ buildPythonPackage rec {
     owner = "vllm-project";
     repo = "vllm";
     tag = "v${version}";
-    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
+    hash = "sha256-gNe/kdsDQno8Fd6mo29feWmbyC0c2+kljlVxY4v7R9U=";
   };

   patches = [
+    (fetchpatch {
+      name = "remove-unused-opentelemetry-semantic-conventions-ai-dep.patch";
+      url = "https://github.com/vllm-project/vllm/commit/6a5d7e45f52c3a13de43b8b4fa9033e3b342ebd2.patch";
+      hash = "sha256-KYthqu+6XwsYYd80PtfrMMjuRV9+ionccr7EbjE4jJE=";
+    })
     ./0002-setup.py-nix-support-respect-cmakeFlags.patch
     ./0003-propagate-pythonpath.patch
     ./0004-drop-lsmod.patch
+    ./0005-drop-intel-reqs.patch
   ];

   postPatch =
@@ -259,6 +282,10 @@ buildPythonPackage rec {
         --replace-fail \
           'set(PYTHON_SUPPORTED_VERSIONS' \
           'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
+
+      # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
+      substituteInPlace CMakeLists.txt \
+        --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
     ''
     + lib.optionalString (nccl == null) ''
       # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
@@ -361,12 +388,17 @@ buildPythonPackage rec {
       xformers
       xgrammar
       numba
+      opentelemetry-sdk
+      opentelemetry-api
+      opentelemetry-exporter-otlp
+      bitsandbytes
     ]
     ++ uvicorn.optional-dependencies.standard
     ++ aioprometheus.optional-dependencies.starlette
     ++ lib.optionals cudaSupport [
       cupy
       pynvml
+      flashinfer
     ];

   dontUseCmakeConfigure = true;
@@ -374,7 +406,7 @@ buildPythonPackage rec {
     [
       (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
       (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
-      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
+      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
     ]
     ++ lib.optionals cudaSupport [
       (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
@@ -416,15 +448,19 @@ buildPythonPackage rec {
   pythonImportsCheck = [ "vllm" ];

-  # updates the cutlass fetcher instead
-  passthru.skipBulkUpdate = true;
+  passthru = {
+    # make internal dependency available to overlays
+    vllm-flash-attn = vllm-flash-attn';
+    # updates the cutlass fetcher instead
+    skipBulkUpdate = true;
+  };

-  meta = with lib; {
+  meta = {
     description = "High-throughput and memory-efficient inference and serving engine for LLMs";
     changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
     homepage = "https://github.com/vllm-project/vllm";
-    license = licenses.asl20;
-    maintainers = with maintainers; [
+    license = lib.licenses.asl20;
+    maintainers = with lib.maintainers; [
       happysalada
       lach
     ];
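The new `vllm-flash-attn ? null` argument, together with `passthru.vllm-flash-attn`, exposes the previously internal flash-attention derivation so it can be overridden from an overlay. Below is a minimal sketch of how a consumer might use that hook; it assumes a standard `python3Packages` set, and the `rev`/`hash` values are placeholders, not values from this PR.

let
  pkgs = import <nixpkgs> { };
  inherit (pkgs) lib fetchFromGitHub python3Packages;

  # Start from the derivation vllm now exposes via passthru.vllm-flash-attn
  # and swap in a different source (placeholder rev/hash).
  myFlashAttn = python3Packages.vllm.vllm-flash-attn.overrideAttrs (old: {
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "0000000000000000000000000000000000000000"; # placeholder commit
      hash = lib.fakeHash; # placeholder hash
    };
  });
in
# Feed the customised derivation back in through the new `vllm-flash-attn ? null` argument;
# when left at null, lib.defaultTo keeps the in-tree vllm-flash-attn' definition.
python3Packages.vllm.override { vllm-flash-attn = myFlashAttn; }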