pkgs/development/python-modules/mistral-common/default.nix (+2 −0)

@@ -6,6 +6,7 @@
   numpy,
   pydantic,
   jsonschema,
+  opencv-python-headless,
   sentencepiece,
   typing-extensions,
   tiktoken,
@@ -37,6 +38,7 @@ buildPythonPackage rec {
     numpy
     pydantic
     jsonschema
+    opencv-python-headless
     sentencepiece
     typing-extensions
     tiktoken
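The only change here is threading opencv-python-headless through mistral-common's inputs, presumably because its image-input handling imports cv2. A minimal sketch (not part of the PR) for smoke-testing that the updated closure builds and both modules import, assuming <nixpkgs> points at a checkout containing this change:

let
  pkgs = import <nixpkgs> { };
  # mistral-common propagates its Python dependencies, so cv2 ends up on the
  # environment's search path alongside mistral_common itself.
  env = pkgs.python3.withPackages (ps: [ ps.mistral-common ]);
in
pkgs.runCommand "mistral-common-import-check" { } ''
  ${env}/bin/python -c "import mistral_common, cv2" && touch $out
''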
pkgs/development/python-modules/vllm/0001-setup.py-don-t-ask-for-hipcc-version.patch (deleted, 100644 → 0, +0 −24)

From f6a7748bee79fc2e1898968fef844daacfa7860b Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else@someonex.net>
Date: Wed, 31 Jul 2024 12:02:53 +0000
Subject: [PATCH 1/2] setup.py: don't ask for hipcc --version

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 72ef26f1..01e006f9 100644
--- a/setup.py
+++ b/setup.py
@@ -279,6 +279,7 @@ def _install_punica() -> bool:


 def get_hipcc_rocm_version():
+    return "0.0"  # `hipcc --version` misbehaves ("unresolved paths") inside the nix sandbox
     # Run the hipcc --version command
     result = subprocess.run(['hipcc', '--version'],
                             stdout=subprocess.PIPE,
--
2.45.1
pkgs/development/python-modules/vllm/0003-propagate-pythonpath.patch (new file, 0 → 100644, +12 −0)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f5a02a5b..e830f987 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -482,6 +482,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
         returned = subprocess.run(
             [sys.executable, "-m", "vllm.model_executor.models.registry"],
             input=input_bytes,
+            env={'PYTHONPATH': ':'.join(sys.path)},
             capture_output=True)

         # check if the subprocess is successful
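The registry helper re-executes the interpreter with a wholesale env={...} replacement, which drops the PYTHONPATH that nixpkgs Python builds use to expose build-time dependencies; the patch forwards the parent's sys.path instead. A self-contained sketch of the failure mode and the fix (assumes only nixpkgs; numpy stands in for an arbitrary dependency):

let
  pkgs = import <nixpkgs> { };
  # Emulate a nixpkgs build, where dependencies are reachable only via PYTHONPATH.
  numpyPath = "${pkgs.python3Packages.numpy}/${pkgs.python3.sitePackages}";
in
pkgs.runCommand "pythonpath-propagation-demo" { PYTHONPATH = numpyPath; } ''
  ${pkgs.python3.interpreter} - <<'EOF'
  import subprocess, sys
  # Scrubbed env, as in unpatched vllm: the child cannot import numpy.
  scrubbed = subprocess.run([sys.executable, "-c", "import numpy"], env={})
  # Patch 0003's approach: forward the parent's sys.path; the import succeeds.
  forwarded = subprocess.run([sys.executable, "-c", "import numpy"],
                             env={"PYTHONPATH": ":".join(sys.path)})
  assert scrubbed.returncode != 0 and forwarded.returncode == 0
  EOF
  touch $out
''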
pkgs/development/python-modules/vllm/0004-drop-lsmod.patch (new file, 0 → 100644, +18 −0)

--- a/setup.py
+++ b/setup.py
@@ -340,14 +340,7 @@ def _is_hpu() -> bool:
         out = subprocess.run(["hl-smi"], capture_output=True, check=True)
         is_hpu_available = out.returncode == 0
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if sys.platform.startswith("linux"):
-            try:
-                output = subprocess.check_output(
-                    'lsmod | grep habanalabs | wc -l', shell=True)
-                is_hpu_available = int(output) > 0
-            except (ValueError, FileNotFoundError, PermissionError,
-                    subprocess.CalledProcessError):
-                pass
+        is_hpu_available = False
     return is_hpu_available
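Both new patches follow the same pattern: a code path that probes the host (here shelling out to lsmod, which is unavailable and meaningless inside the sandbox) is replaced with a deterministic answer. For experimenting with further patches of this kind before upstreaming them, an override along these lines works; the patch filename is hypothetical:

# Hypothetical local experiment; 0005-my-sandbox-fix.patch is not part of this PR.
python3Packages.vllm.overridePythonAttrs (old: {
  patches = (old.patches or [ ]) ++ [ ./0005-my-sandbox-fix.patch ];
})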
pkgs/development/python-modules/vllm/default.nix (+249 −43)

@@ -5,14 +5,21 @@
   buildPythonPackage,
   pythonRelaxDepsHook,
   fetchFromGitHub,
+  symlinkJoin,
+  autoAddDriverRunpath,
   # build system
-  packaging,
-  setuptools,
-  wheel,
-  # dependencies
   which,
   ninja,
   cmake,
+  packaging,
+  setuptools,
+  setuptools-scm,
+  torch,
+  outlines,
+  wheel,
+  # dependencies
   psutil,
   ray,
   pandas,
@@ -21,43 +28,174 @@
   numpy,
   transformers,
   xformers,
+  xgrammar,
   fastapi,
   uvicorn,
   pydantic,
   aioprometheus,
   pynvml,
   openai,
   pyzmq,
   tiktoken,
+  torchaudio,
+  torchvision,
   py-cpuinfo,
   lm-format-enforcer,
   prometheus-fastapi-instrumentator,
   cupy,
   writeShellScript,
+  gguf,
+  einops,
+  importlib-metadata,
+  partial-json-parser,
+  compressed-tensors,
+  mistral-common,
+  msgspec,
+  numactl,
+  tokenizers,
+  oneDNN,
+  blake3,
+  depyf,
+  opencv-python-headless,

   config,
   cudaSupport ? config.cudaSupport,
   cudaPackages ? { },
-  # Has to be either rocm or cuda, default to the free one
-  rocmSupport ? !config.cudaSupport,
+  rocmSupport ? config.rocmSupport,
   rocmPackages ? { },
   gpuTargets ? [ ],
 }@args:

 let
   inherit (lib)
     lists
     strings
     trivial
     ;

   inherit (cudaPackages) cudaFlags;

+  shouldUsePkg =
+    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
+
   # see CMakeLists.txt, grepping for GIT_TAG near cutlass
   # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
   cutlass = fetchFromGitHub {
     owner = "NVIDIA";
     repo = "cutlass";
-    rev = "refs/tags/v3.5.0";
-    sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
+    tag = "v3.7.0";
+    hash = "sha256-GUTRXmv3DiM/GN5Bvv2LYovMLKZMlMhoKv4O0g627gs=";
   };

+  vllm-flash-attn = stdenv.mkDerivation rec {
+    pname = "vllm-flash-attn";
+    version = "2.6.2";
+
+    # see CMakeLists.txt, grepping for GIT_TAG near vllm-flash-attn
+    # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
+    src = fetchFromGitHub {
+      owner = "vllm-project";
+      repo = "flash-attention";
+      rev = "d4e09037abf588af1ec47d0e966b237ee376876c";
+      hash = "sha256-KFEsZlrwvCgvPzQ/pCLWcnbGq89mWE3yTDdtJSV9MII=";
+    };
+
+    dontConfigure = true;
+
+    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
+    buildPhase = ''
+      rm -rf csrc/cutlass
+      ln -sf ${cutlass} csrc/cutlass
+    '';
+
+    installPhase = ''
+      cp -rva . $out
+    '';
+  };
+
+  cpuSupport = !cudaSupport && !rocmSupport;
+
   # https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
   supportedTorchCudaCapabilities =
     let
       real = [
         "3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5"
         "8.0" "8.6" "8.7" "8.9" "9.0" "9.0a"
       ];
       ptx = lists.map (x: "${x}+PTX") real;
     in
     real ++ ptx;

   # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
   # of the first list *from* the second list. That means:
   # lists.subtractLists a b = b - a

   # For CUDA
   supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
   unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;

+  isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild;
+
   # Use trivial.warnIf to print a warning if any unsupported GPU targets are specified.
   gpuArchWarner =
     supported: unsupported:
     trivial.throwIf (supported == [ ]) (
       "No supported GPU targets specified. Requested GPU targets: "
       + strings.concatStringsSep ", " unsupported
     ) supported;

   # Create the gpuTargetString.
   gpuTargetString = strings.concatStringsSep ";" (
     if gpuTargets != [ ] then
       # If gpuTargets is specified, it always takes priority.
       gpuTargets
     else if cudaSupport then
       gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
     else if rocmSupport then
       rocmPackages.clr.gpuTargets
     else
       throw "No GPU targets specified"
   );

+  mergedCudaLibraries = with cudaPackages; [
+    cuda_cudart # cuda_runtime.h, -lcudart
+    cuda_cccl
+    libcusparse # cusparse.h
+    libcusolver # cusolverDn.h
+    cuda_nvtx
+    cuda_nvrtc
+    libcublas
+  ];
+
+  # Some packages are not available on all platforms
+  nccl = shouldUsePkg (cudaPackages.nccl or null);
+
+  getAllOutputs = p: [
+    (lib.getBin p)
+    (lib.getLib p)
+    (lib.getDev p)
+  ];
 in
 buildPythonPackage rec {
   pname = "vllm";
-  version = "0.6.2";
+  version = "0.7.1";
   pyproject = true;

+  stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
@@ -65,30 +203,54 @@ buildPythonPackage rec {
   src = fetchFromGitHub {
     owner = "vllm-project";
     repo = pname;
-    rev = "refs/tags/v${version}";
-    hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
+    tag = "v${version}";
+    hash = "sha256-CImXKMEv+jHqngvcr8W6fQLiCo1mqmcZ0Ho0bfAgfbg=";
   };

   patches = [
-    ./0001-setup.py-don-t-ask-for-hipcc-version.patch
     ./0002-setup.py-nix-support-respect-cmakeFlags.patch
+    ./0003-propagate-pythonpath.patch
+    ./0004-drop-lsmod.patch
   ];

   # Ignore the python version check because it hard-codes minor versions and
   # lags behind `ray`'s python interpreter support
-  postPatch = ''
+  postPatch = ''
     substituteInPlace CMakeLists.txt \
       --replace-fail \
         'set(PYTHON_SUPPORTED_VERSIONS' \
         'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
+
+    # Relax torch dependency manually because the nonstandard requirements format
+    # is not caught by pythonRelaxDeps
+    substituteInPlace requirements*.txt pyproject.toml \
+      --replace-warn 'torch==2.5.1' 'torch==${lib.getVersion torch}' \
+      --replace-warn 'torch == 2.5.1' 'torch == ${lib.getVersion torch}'
+  ''
+  + lib.optionalString (nccl == null) ''
+    # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
+    substituteInPlace vllm/distributed/parallel_state.py \
+      --replace-fail '"nccl"' '"gloo"'
+  '';

-  nativeBuildInputs = [
+  nativeBuildInputs = [
     cmake
     ninja
     pythonRelaxDepsHook
     which
-  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
+  ]
+  ++ lib.optionals rocmSupport [ rocmPackages.hipcc ]
+  ++ lib.optionals cudaSupport [
+    cudaPackages.cuda_nvcc
+    autoAddDriverRunpath
+  ]
+  ++ lib.optionals isCudaJetson [ cudaPackages.autoAddCudaCompatRunpath ];

   build-system = [
     packaging
@@ -97,18 +259,22 @@ buildPythonPackage rec {
   ];

   buildInputs =
-    (lib.optionals cudaSupport (
-      with cudaPackages;
-      [
-        cuda_cudart # cuda_runtime.h, -lcudart
-        cuda_cccl
-        libcusparse # cusparse.h
-        libcusolver # cusolverDn.h
-        cuda_nvcc
-        cuda_nvtx
-        libcublas
-      ]
-    ))
+    [
+      setuptools-scm
+      torch
+    ]
+    ++ (lib.optionals cpuSupport [
+      numactl
+      oneDNN
+    ])
+    ++ (lib.optionals cudaSupport (
+      mergedCudaLibraries
+      ++ (with cudaPackages; [
+        nccl
+        cudnn
+        libcufile
+      ])
+    ))
     ++ (lib.optionals rocmSupport (
       with rocmPackages;
       [
@@ -123,10 +289,13 @@ buildPythonPackage rec {
   dependencies = [
     aioprometheus
+    blake3
+    depyf
     fastapi
     lm-format-enforcer
     numpy
     openai
+    opencv-python-headless
     outlines
     pandas
     prometheus-fastapi-instrumentator
@@ -138,27 +307,64 @@ buildPythonPackage rec {
     ray
     sentencepiece
     tiktoken
+    tokenizers
+    msgspec
+    gguf
+    einops
+    importlib-metadata
+    partial-json-parser
+    compressed-tensors
+    mistral-common
     torch
+    torchaudio
+    torchvision
     transformers
     uvicorn
     xformers
+    xgrammar
   ]
   ++ uvicorn.optional-dependencies.standard
   ++ aioprometheus.optional-dependencies.starlette
   ++ lib.optionals cudaSupport [
     cupy
     pynvml
   ];

   dontUseCmakeConfigure = true;
-  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
+  cmakeFlags =
+    [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
+      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
+    ]
+    ++ lib.optionals cudaSupport [
+      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
+      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.cudaFlags.cmakeCudaArchitecturesString}")
+      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
+        name = "cuda-merged-${cudaPackages.cudaVersion}";
+        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
+      }}")
+      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
+      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
+      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
+    ]
+    ++ lib.optionals cpuSupport [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
+    ];

   env =
-    lib.optionalAttrs cudaSupport {
-      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
-    }
+    lib.optionalAttrs cudaSupport {
+      VLLM_TARGET_DEVICE = "cuda";
+      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
+    }
+    // lib.optionalAttrs rocmSupport {
+      VLLM_TARGET_DEVICE = "rocm";
+      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
+      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
+      ROCM_HOME = "${rocmPackages.clr}";
+    }
+    // lib.optionalAttrs cpuSupport {
+      VLLM_TARGET_DEVICE = "cpu";
+    };

   pythonRelaxDeps = true;
@@ -174,8 +380,8 @@ buildPythonPackage rec {
       happysalada
       lach
     ];
-    # RuntimeError: Unknown runtime environment
-    broken = true;
-    # broken = !cudaSupport && !rocmSupport;
+    # CPU support relies on unpackaged dependency `intel_extension_for_pytorch`
+    broken = cpuSupport;
   };
 }
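One behavioral change worth calling out: rocmSupport now follows config.rocmSupport instead of defaulting to !config.cudaSupport, so a plain import of nixpkgs yields the CPU build, which meta.broken now flags (its intel_extension_for_pytorch dependency is not packaged). A rough sketch of requesting the CUDA build through nixpkgs config instead (assuming unfree CUDA packages are acceptable in your setup):

let
  pkgs = import <nixpkgs> {
    config = {
      allowUnfree = true; # the CUDA toolchain is unfree
      cudaSupport = true; # flips vllm's cudaSupport default, avoiding the broken CPU path
    };
  };
in
pkgs.python3Packages.vllm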