Unverified commit ab36ef17, authored by Pavol Rusnak, committed by GitHub
Browse files

python3Packages.vllm: 0.6.2 -> 0.7.1 (#379165)

parents 498fb770 f8a07769
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
  numpy,
  pydantic,
  jsonschema,
  opencv-python-headless,
  sentencepiece,
  typing-extensions,
  tiktoken,
@@ -37,6 +38,7 @@ buildPythonPackage rec {
    numpy
    pydantic
    jsonschema
    opencv-python-headless
    sentencepiece
    typing-extensions
    tiktoken
+0 −24
Original line number Diff line number Diff line
From f6a7748bee79fc2e1898968fef844daacfa7860b Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else@someonex.net>
Date: Wed, 31 Jul 2024 12:02:53 +0000
Subject: [PATCH 1/2] setup.py: don't ask for hipcc --version

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 72ef26f1..01e006f9 100644
--- a/setup.py
+++ b/setup.py
@@ -279,6 +279,7 @@ def _install_punica() -> bool:
 
 
 def get_hipcc_rocm_version():
+    return "0.0" # `hipcc --version` misbehaves ("unresolved paths") inside the nix sandbox
     # Run the hipcc --version command
     result = subprocess.run(['hipcc', '--version'],
                             stdout=subprocess.PIPE,
-- 
2.45.1
+12 −0
Original line number Diff line number Diff line
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f5a02a5b..e830f987 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -482,6 +482,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
         returned = subprocess.run(
             [sys.executable, "-m", "vllm.model_executor.models.registry"],
             input=input_bytes,
+            env={'PYTHONPATH': ':'.join(sys.path)},
             capture_output=True)
 
         # check if the subprocess is successful
+18 −0
Original line number Diff line number Diff line
--- a/setup.py
+++ b/setup.py
@@ -340,14 +340,7 @@ def _is_hpu() -> bool:
         out = subprocess.run(["hl-smi"], capture_output=True, check=True)
         is_hpu_available = out.returncode == 0
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if sys.platform.startswith("linux"):
-            try:
-                output = subprocess.check_output(
-                    'lsmod | grep habanalabs | wc -l', shell=True)
-                is_hpu_available = int(output) > 0
-            except (ValueError, FileNotFoundError, PermissionError,
-                    subprocess.CalledProcessError):
-                pass
+        is_hpu_available = False
     return is_hpu_available
 
 
+249 −43
Original line number Diff line number Diff line
@@ -5,14 +5,21 @@
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  packaging,
  setuptools,
  wheel,

  # dependencies
  which,
  ninja,
  cmake,
  packaging,
  setuptools,
  setuptools-scm,
  torch,
  outlines,
  wheel,
  psutil,
  ray,
  pandas,
@@ -21,43 +28,174 @@
  numpy,
  transformers,
  xformers,
  xgrammar,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  writeShellScript,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,

  config,

  cudaSupport ? config.cudaSupport,
  cudaPackages ? { },

  # Has to be either rocm or cuda, default to the free one
  rocmSupport ? !config.cudaSupport,
  rocmSupport ? config.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}@args:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) cudaFlags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    rev = "refs/tags/v3.5.0";
    sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
    tag = "v3.7.0";
    hash = "sha256-GUTRXmv3DiM/GN5Bvv2LYovMLKZMlMhoKv4O0g627gs=";
  };

  vllm-flash-attn = stdenv.mkDerivation rec {
    pname = "vllm-flash-attn";
    version = "2.6.2";

    # see CMakeLists.txt, grepping for GIT_TAG near vllm-flash-attn
    # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "d4e09037abf588af1ec47d0e966b237ee376876c";
      hash = "sha256-KFEsZlrwvCgvPzQ/pCLWcnbGq89mWE3yTDdtJSV9MII=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  #   of the first list *from* the second list. That means:
  #   lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild;

  # Use trivial.warnIf to print a warning if any unsupported GPU targets are specified.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.6.2";
  version = "0.7.1";
  pyproject = true;

  stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
@@ -65,30 +203,54 @@ buildPythonPackage rec {
  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = pname;
    rev = "refs/tags/v${version}";
    hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
    tag = "v${version}";
    hash = "sha256-CImXKMEv+jHqngvcr8W6fQLiCo1mqmcZ0Ho0bfAgfbg=";
  };

  patches = [
    ./0001-setup.py-don-t-ask-for-hipcc-version.patch
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];

  # Ignore the python version check because it hard-codes minor versions and
  # lags behind `ray`'s python interpreter support
  postPatch = ''
  postPatch =
    ''
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

      # Relax torch dependency manually because the nonstandard requirements format
      # is not caught by pythonRelaxDeps
      substituteInPlace requirements*.txt pyproject.toml \
        --replace-warn 'torch==2.5.1' 'torch==${lib.getVersion torch}' \
        --replace-warn 'torch == 2.5.1' 'torch == ${lib.getVersion torch}'
    ''
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';

  nativeBuildInputs = [
  nativeBuildInputs =
    [
      cmake
      ninja
      pythonRelaxDepsHook
      which
  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
    ]
    ++ lib.optionals rocmSupport [
      rocmPackages.hipcc
    ]
    ++ lib.optionals cudaSupport [
      cudaPackages.cuda_nvcc
      autoAddDriverRunpath
    ]
    ++ lib.optionals isCudaJetson [
      cudaPackages.autoAddCudaCompatRunpath
    ];

  build-system = [
    packaging
@@ -97,18 +259,22 @@ buildPythonPackage rec {
  ];

  buildInputs =
    (lib.optionals cudaSupport (
      with cudaPackages;
    [
        cuda_cudart # cuda_runtime.h, -lcudart
        cuda_cccl
        libcusparse # cusparse.h
        libcusolver # cusolverDn.h
        cuda_nvcc
        cuda_nvtx
        libcublas
      setuptools-scm
      torch
    ]
    ))
    ++ (lib.optionals cpuSupport ([
      numactl
      oneDNN
    ]))
    ++ (
      lib.optionals cudaSupport mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ (lib.optionals rocmSupport (
      with rocmPackages;
      [
@@ -123,10 +289,13 @@ buildPythonPackage rec {
  dependencies =
    [
      aioprometheus
      blake3
      depyf
      fastapi
      lm-format-enforcer
      numpy
      openai
      opencv-python-headless
      outlines
      pandas
      prometheus-fastapi-instrumentator
@@ -138,27 +307,64 @@ buildPythonPackage rec {
      ray
      sentencepiece
      tiktoken
      tokenizers
      msgspec
      gguf
      einops
      importlib-metadata
      partial-json-parser
      compressed-tensors
      mistral-common
      torch
      torchaudio
      torchvision
      transformers
      uvicorn
      xformers
      xgrammar
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
  cmakeFlags =
    [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.cudaFlags.cmakeCudaArchitecturesString
      }")
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];

  env =
    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  pythonRelaxDeps = true;
@@ -174,8 +380,8 @@ buildPythonPackage rec {
      happysalada
      lach
    ];
    # RuntimeError: Unknown runtime environment
    broken = true;
    # broken = !cudaSupport && !rocmSupport;

    # CPU support relies on unpackaged dependency `intel_extension_for_pytorch`
    broken = cpuSupport;
  };
}
Loading