Unverified commit ab36ef17, authored by Pavol Rusnak, committed by GitHub
Browse files

python3Packages.vllm: 0.6.2 -> 0.7.1 (#379165)

parents 498fb770 f8a07769
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
  numpy,
  pydantic,
  jsonschema,
  opencv-python-headless,
  sentencepiece,
  typing-extensions,
  tiktoken,
@@ -37,6 +38,7 @@ buildPythonPackage rec {
    numpy
    pydantic
    jsonschema
    opencv-python-headless
    sentencepiece
    typing-extensions
    tiktoken
+0 −24
Original line number Diff line number Diff line
From f6a7748bee79fc2e1898968fef844daacfa7860b Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else@someonex.net>
Date: Wed, 31 Jul 2024 12:02:53 +0000
Subject: [PATCH 1/2] setup.py: don't ask for hipcc --version

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 72ef26f1..01e006f9 100644
--- a/setup.py
+++ b/setup.py
@@ -279,6 +279,7 @@ def _install_punica() -> bool:
 
 
 def get_hipcc_rocm_version():
+    return "0.0" # `hipcc --version` misbehaves ("unresolved paths") inside the nix sandbox
     # Run the hipcc --version command
     result = subprocess.run(['hipcc', '--version'],
                             stdout=subprocess.PIPE,
-- 
2.45.1
+12 −0
Original line number Diff line number Diff line
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f5a02a5b..e830f987 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -482,6 +482,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
         returned = subprocess.run(
             [sys.executable, "-m", "vllm.model_executor.models.registry"],
             input=input_bytes,
+            env={'PYTHONPATH': ':'.join(sys.path)},
             capture_output=True)
 
         # check if the subprocess is successful
+18 −0
Original line number Diff line number Diff line
--- a/setup.py
+++ b/setup.py
@@ -340,14 +340,7 @@ def _is_hpu() -> bool:
         out = subprocess.run(["hl-smi"], capture_output=True, check=True)
         is_hpu_available = out.returncode == 0
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if sys.platform.startswith("linux"):
-            try:
-                output = subprocess.check_output(
-                    'lsmod | grep habanalabs | wc -l', shell=True)
-                is_hpu_available = int(output) > 0
-            except (ValueError, FileNotFoundError, PermissionError,
-                    subprocess.CalledProcessError):
-                pass
+        is_hpu_available = False
     return is_hpu_available
 
 
+249 −43
Original line number Diff line number Diff line
@@ -5,14 +5,21 @@
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  packaging,
  setuptools,
  wheel,

  # dependencies
  which,
  ninja,
  cmake,
  packaging,
  setuptools,
  setuptools-scm,
  torch,
  outlines,
  wheel,
  psutil,
  ray,
  pandas,
@@ -21,43 +28,174 @@
  numpy,
  transformers,
  xformers,
  xgrammar,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  writeShellScript,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,

  config,

  cudaSupport ? config.cudaSupport,
  cudaPackages ? { },

  # Has to be either rocm or cuda, default to the free one
  rocmSupport ? !config.cudaSupport,
  rocmSupport ? config.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}@args:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) cudaFlags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    rev = "refs/tags/v3.5.0";
    sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
    tag = "v3.7.0";
    hash = "sha256-GUTRXmv3DiM/GN5Bvv2LYovMLKZMlMhoKv4O0g627gs=";
  };

  vllm-flash-attn = stdenv.mkDerivation rec {
    pname = "vllm-flash-attn";
    version = "2.6.2";

    # see CMakeLists.txt, grepping for GIT_TAG near vllm-flash-attn
    # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "d4e09037abf588af1ec47d0e966b237ee376876c";
      hash = "sha256-KFEsZlrwvCgvPzQ/pCLWcnbGq89mWE3yTDdtJSV9MII=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  #   of the first list *from* the second list. That means:
  #   lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild;

  # Use trivial.warnIf to print a warning if any unsupported GPU targets are specified.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.6.2";
  version = "0.7.1";
  pyproject = true;

  stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
@@ -65,30 +203,54 @@ buildPythonPackage rec {
  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = pname;
    rev = "refs/tags/v${version}";
    hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
    tag = "v${version}";
    hash = "sha256-CImXKMEv+jHqngvcr8W6fQLiCo1mqmcZ0Ho0bfAgfbg=";
  };

  patches = [
    ./0001-setup.py-don-t-ask-for-hipcc-version.patch
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];

  # Ignore the python version check because it hard-codes minor versions and
  # lags behind `ray`'s python interpreter support
  postPatch = ''
  postPatch =
    ''
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

      # Relax torch dependency manually because the nonstandard requirements format
      # is not caught by pythonRelaxDeps
      substituteInPlace requirements*.txt pyproject.toml \
        --replace-warn 'torch==2.5.1' 'torch==${lib.getVersion torch}' \
        --replace-warn 'torch == 2.5.1' 'torch == ${lib.getVersion torch}'
    ''
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';

  nativeBuildInputs = [
  nativeBuildInputs =
    [
      cmake
      ninja
      pythonRelaxDepsHook
      which
  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
    ]
    ++ lib.optionals rocmSupport [
      rocmPackages.hipcc
    ]
    ++ lib.optionals cudaSupport [
      cudaPackages.cuda_nvcc
      autoAddDriverRunpath
    ]
    ++ lib.optionals isCudaJetson [
      cudaPackages.autoAddCudaCompatRunpath
    ];

  build-system = [
    packaging
@@ -97,18 +259,22 @@ buildPythonPackage rec {
  ];

  buildInputs =
    (lib.optionals cudaSupport (
      with cudaPackages;
    [
        cuda_cudart # cuda_runtime.h, -lcudart
        cuda_cccl
        libcusparse # cusparse.h
        libcusolver # cusolverDn.h
        cuda_nvcc
        cuda_nvtx
        libcublas
      setuptools-scm
      torch
    ]
    ))
    ++ (lib.optionals cpuSupport ([
      numactl
      oneDNN
    ]))
    ++ (
      lib.optionals cudaSupport mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ (lib.optionals rocmSupport (
      with rocmPackages;
      [
@@ -123,10 +289,13 @@ buildPythonPackage rec {
  dependencies =
    [
      aioprometheus
      blake3
      depyf
      fastapi
      lm-format-enforcer
      numpy
      openai
      opencv-python-headless
      outlines
      pandas
      prometheus-fastapi-instrumentator
@@ -138,27 +307,64 @@ buildPythonPackage rec {
      ray
      sentencepiece
      tiktoken
      tokenizers
      msgspec
      gguf
      einops
      importlib-metadata
      partial-json-parser
      compressed-tensors
      mistral-common
      torch
      torchaudio
      torchvision
      transformers
      uvicorn
      xformers
      xgrammar
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
  cmakeFlags =
    [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.cudaFlags.cmakeCudaArchitecturesString
      }")
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];

  env =
    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  pythonRelaxDeps = true;
@@ -174,8 +380,8 @@ buildPythonPackage rec {
      happysalada
      lach
    ];
    # RuntimeError: Unknown runtime environment
    broken = true;
    # broken = !cudaSupport && !rocmSupport;

    # CPU support relies on unpackaged dependency `intel_extension_for_pytorch`
    broken = cpuSupport;
  };
}
Loading