Unverified Commit 811c0af5 authored by Emily's avatar Emily Committed by GitHub
Browse files

dcgm: 3.3.5 -> 3.3.9; cudaPackages_10{,_0,_1,_2}: drop (#357655)

parents 7baa9f1e e3d26a18
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@
    After you run ALTER EXTENSION, you must run [this SQL script](https://github.com/timescale/timescaledb-extras/blob/master/utils/2.15.X-fix_hypertable_foreign_keys.sql). For more details, see the following pull requests [#6797](https://github.com/timescale/timescaledb/pull/6797).
    PostgreSQL 13 is no longer supported in TimescaleDB v2.16.

- Support for CUDA 10 has been dropped, as announced in the 24.11 release notes.

- `kanata` was updated to v1.7.0, which introduces several breaking changes.
  See the release notes of
  [v1.7.0](https://github.com/jtroo/kanata/releases/tag/v1.7.0)
+2 −28
Original line number Diff line number Diff line
@@ -14,23 +14,11 @@
, Accelerate, CoreGraphics, CoreVideo
, lmdbSupport ? true, lmdb
, leveldbSupport ? true, leveldb, snappy
, cudaSupport ? config.cudaSupport, cudaPackages ? { }
, cudnnSupport ? cudaSupport
, ncclSupport ? false
, pythonSupport ? false, python ? null, numpy ? null
, substituteAll
}:

let
  inherit (cudaPackages) backendStdenv cudatoolkit nccl;
  # The default for cudatoolkit 10.1 is CUDNN 8.0.5, the last version to support CUDA 10.1.
  # However, this caffe does not build with CUDNN 8.x, so we use CUDNN 7.6.5 instead.
  # Earlier versions of cudatoolkit use pre-8.x CUDNN, so we use the default.
  hasCudnn =
    if lib.versionOlder cudatoolkit.version "10.1"
    then cudaPackages ? cudnn
    else cudaPackages ? cudnn_7_6;

  toggle = bool: if bool then "ON" else "OFF";

  test_model_weights = fetchurl {
@@ -57,20 +45,12 @@ stdenv.mkDerivation rec {
    # boost_python expects
    [ (if pythonSupport then "-Dpython_version=${python.pythonVersion}" else "-DBUILD_python=OFF")
      "-DBLAS=open"
    ] ++ (if cudaSupport then [
           "-DCUDA_ARCH_NAME=All"
           "-DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin/cc"
         ] else [ "-DCPU_ONLY=ON" ])
      ++ ["-DUSE_NCCL=${toggle ncclSupport}"]
      ++ ["-DUSE_LEVELDB=${toggle leveldbSupport}"]
      "-DCPU_ONLY=ON"
    ] ++ ["-DUSE_LEVELDB=${toggle leveldbSupport}"]
      ++ ["-DUSE_LMDB=${toggle lmdbSupport}"];

  buildInputs = [ boost gflags glog protobuf hdf5-cpp opencv4 blas ]
                ++ lib.optional cudaSupport cudatoolkit
                ++ lib.optional (lib.versionOlder cudatoolkit.version "10.1" && hasCudnn) cudaPackages.cudnn
                ++ lib.optional (lib.versionAtLeast cudatoolkit.version "10.1" && hasCudnn) cudaPackages.cudnn_7_6
                ++ lib.optional lmdbSupport lmdb
                ++ lib.optional ncclSupport nccl
                ++ lib.optionals leveldbSupport [ leveldb snappy ]
                ++ lib.optionals pythonSupport [ python numpy ]
                ++ lib.optionals stdenv.hostPlatform.isDarwin [ Accelerate CoreGraphics CoreVideo ]
@@ -105,9 +85,6 @@ stdenv.mkDerivation rec {
    substituteInPlace src/caffe/util/io.cpp --replace \
      'SetTotalBytesLimit(kProtoReadBytesLimit, 536870912)' \
      'SetTotalBytesLimit(kProtoReadBytesLimit)'
  '' + lib.optionalString (cudaSupport && lib.versionAtLeast cudatoolkit.version "9.0") ''
    # CUDA 9.0 doesn't support sm_20
    sed -i 's,20 21(20) ,,' cmake/Cuda.cmake
  '';

  preConfigure = lib.optionalString pythonSupport ''
@@ -150,10 +127,7 @@ stdenv.mkDerivation rec {
    maintainers = [ ];
    broken =
      (pythonSupport && (python.isPy310))
      || cudaSupport
      || !(leveldbSupport -> (leveldb != null && snappy != null))
      || !(cudnnSupport -> (hasCudnn && cudaSupport))
      || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported))
      || !(pythonSupport -> (python != null && numpy != null))
    ;
    license = licenses.bsd2;
+40 −0
Original line number Diff line number Diff line
diff --git a/cmake/FindJsoncpp.cmake b/cmake/FindJsoncpp.cmake
index abedf7bb85..d7a52f5c7b 100644
--- a/cmake/FindJsoncpp.cmake
+++ b/cmake/FindJsoncpp.cmake
@@ -16,8 +16,8 @@
 
 if (NOT TARGET JsonCpp::JsonCpp)
     find_package(jsoncpp REQUIRED CONFIG)
-    set(JSONCPP_STATIC_LIBS jsoncpp_static)
-    set(JSONCPP_INCLUDE_PATH $<TARGET_PROPERTY:jsoncpp_static,INTERFACE_INCLUDE_DIRECTORIES>)
+    set(JSONCPP_STATIC_LIBS jsoncpp)
+    set(JSONCPP_INCLUDE_PATH $<TARGET_PROPERTY:jsoncpp,INTERFACE_INCLUDE_DIRECTORIES>)
 endif()
 # set(Jsoncpp_PATH_PREFIXES /usr/local "${Jsoncpp_ROOT}" "$ENV{HOME}")
 # foreach(prefix ${Jsoncpp_PATH_PREFIXES})
diff --git a/cmake/FindLibevent.cmake b/cmake/FindLibevent.cmake
index 354d6f9b99..38aca9534e 100644
--- a/cmake/FindLibevent.cmake
+++ b/cmake/FindLibevent.cmake
@@ -26,16 +26,12 @@
 endforeach()
 
 find_path(LIBEVENT_INCLUDE_DIR evhttp.h event.h PATHS ${Libevent_INCLUDE_PATHS})
-find_library(LIBEVENT_STATIC_LIB NAMES libevent.a libevent_core.a libevent_extra.a PATHS ${Libevent_LIB_PATHS})
-find_library(LIBEVENT_PTHREAD_STATIC_LIB NAMES libevent_pthreads.a PATHS ${Libevent_LIB_PATHS})
+find_library(LIBEVENT_STATIC_LIB NAMES libevent.so libevent_core.so libevent_extra.so PATHS ${Libevent_LIB_PATHS})
+find_library(LIBEVENT_PTHREAD_STATIC_LIB NAMES libevent_pthreads.so PATHS ${Libevent_LIB_PATHS})
 
-if (LIBEVENT_INCLUDE_DIR AND LIBEVENT_STATIC_LIB AND LIBEVENT_PTHREAD_STATIC_LIB)
+if (LIBEVENT_INCLUDE_DIR AND LIBEVENT_STATIC_LIB)
     set(Libevent_FOUND TRUE)
-    add_library(libevent_event_static STATIC IMPORTED)
-    set_target_properties(libevent_event_static PROPERTIES IMPORTED_LOCATION ${LIBEVENT_STATIC_LIB})
-    add_library(libevent_event_pthread STATIC IMPORTED)
-    set_target_properties(libevent_event_pthread PROPERTIES IMPORTED_LOCATION ${LIBEVENT_PTHREAD_STATIC_LIB})
-    set(LIBEVENT_STATIC_LIBS libevent_event_static libevent_event_pthread)
+    set(LIBEVENT_STATIC_LIBS ${LIBEVENT_STATIC_LIB} ${LIBEVENT_PTHREAD_STATIC_LIB})
 else ()
     set(Libevent_FOUND FALSE)
 endif ()
+110 −0
Original line number Diff line number Diff line
diff --git a/common/CudaWorker/DcgmDgemm.cpp b/common/CudaWorker/DcgmDgemm.cpp
index 8d33a3256e..6b3284258d 100644
--- a/common/CudaWorker/DcgmDgemm.cpp
+++ b/common/CudaWorker/DcgmDgemm.cpp
@@ -17,6 +17,7 @@
 
 #include <exception>
 #include <stdexcept>
+#include <cinttypes>
 
 #define CU_CHK(op)                                               \
     if (auto const status = op; status != CUBLAS_STATUS_SUCCESS) \
@@ -122,4 +123,4 @@
     return CUBLAS_STATUS_SUCCESS;
 }
 
-} // namespace DcgmNs
\ No newline at end of file
+} // namespace DcgmNs
diff --git a/common/DcgmError.h b/common/DcgmError.h
index 8638cdceb1..e8d817c0d4 100644
--- a/common/DcgmError.h
+++ b/common/DcgmError.h
@@ -17,6 +17,7 @@
 
 #include <sstream>
 #include <string>
+#include <cinttypes>
 
 #include <dcgm_agent.h>
 #include <dcgm_errors.h>
diff --git a/common/DcgmStringHelpers.cpp b/common/DcgmStringHelpers.cpp
index b41917e3b7..1fe63980c7 100644
--- a/common/DcgmStringHelpers.cpp
+++ b/common/DcgmStringHelpers.cpp
@@ -17,6 +17,7 @@
 
 #include <cstring>
 #include <string>
+#include <algorithm>
 
 /*****************************************************************************/
 void dcgmTokenizeString(const std::string &src, const std::string &delimiter, std::vector<std::string> &tokens)
diff --git a/dcgmi/CommandOutputController.cpp b/dcgmi/CommandOutputController.cpp
index 5057205564..8520171efa 100644
--- a/dcgmi/CommandOutputController.cpp
+++ b/dcgmi/CommandOutputController.cpp
@@ -24,6 +24,7 @@
 #include "dcgm_agent.h"
 #include <DcgmStringHelpers.h>
 #include <algorithm>
+#include <functional>
 #include <cstdarg>
 #include <iostream>
 #include <string>
diff --git a/dcgmi/Diag.h b/dcgmi/Diag.h
index a326f7b949..563fb3c9c0 100755
--- a/dcgmi/Diag.h
+++ b/dcgmi/Diag.h
@@ -24,6 +24,7 @@
 #define DIAG_H_
 
 #include <optional>
+#include <functional>
 
 #include "Command.h"
 #include "CommandOutputController.h"
diff --git a/hostengine/src/HostEngineOutput.cpp b/hostengine/src/HostEngineOutput.cpp
index 23c6ca9f54..798b83b3e4 100644
--- a/hostengine/src/HostEngineOutput.cpp
+++ b/hostengine/src/HostEngineOutput.cpp
@@ -20,6 +20,7 @@
 #include <iostream>
 #include <limits>
 #include <string_view>
+#include <unordered_map>
 
 namespace
 {
@@ -365,4 +366,4 @@
         }
     }
     os << std::endl;
-}
\ No newline at end of file
+}
diff --git a/nvvs/src/NvvsCommon.cpp b/nvvs/src/NvvsCommon.cpp
index 8f7888649b..1604d9dabe 100644
--- a/nvvs/src/NvvsCommon.cpp
+++ b/nvvs/src/NvvsCommon.cpp
@@ -15,6 +15,7 @@
  */
 #include <sstream>
 #include <stdexcept>
+#include <algorithm>
 #include <sys/stat.h>
 #include <sys/types.h>
 
diff --git a/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp b/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp
index 9eebeaf1c4..6e21201229 100644
--- a/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp
+++ b/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp
@@ -20,6 +20,7 @@
 
 #include <atomic>
 #include <mutex>
+#include <cstdlib>
 
 static void *g_nvmlLib                                     = 0;
 static std::atomic_uint32_t g_nvmlStaticLibResetHooksCount = 1;
+68 −58
Original line number Diff line number Diff line
{ lib
, gcc11Stdenv
, stdenv
, fetchFromGitHub
, autoAddDriverRunpath
, catch2
, cmake
, cudaPackages_10_2
, ninja
, cudaPackages_11_8
, cudaPackages_12
, boost
, fmt_9
, git
, jsoncpp
@@ -16,30 +17,14 @@
, symlinkJoin
, tclap_1_4
, yaml-cpp

, static ? gcc11Stdenv.hostPlatform.isStatic
}:
let
  # DCGM depends on 3 different versions of CUDA at the same time.
  # The runtime closure, thankfully, is quite small because most things
  # are statically linked.
  cudaPackageSetByVersion = [
    {
      version = "10";
      # Nixpkgs cudaPackages_10 doesn't have redist packages broken out.
      pkgSet = [
        cudaPackages_10_2.cudatoolkit
        cudaPackages_10_2.cudatoolkit.lib
      ];
    }
    {
      version = "11";
      pkgSet = getCudaPackages cudaPackages_11_8;
    }
    {
      version = "12";
      pkgSet = getCudaPackages cudaPackages_12;
    }
  # DCGM depends on 2 different versions of CUDA at the same time.
  # The runtime closure, thankfully, is quite small as it does not
  # include the CUDA libraries.
  cudaPackageSets = [
    cudaPackages_11_8
    cudaPackages_12
  ];

  # Select needed redist packages from cudaPackages
@@ -54,44 +39,40 @@ let
    libcurand
  ];

  # Builds CMake code to add CUDA paths for include and lib.
  mkAppendCudaPaths = { version, pkgSet }:
  # Builds CMake flags to add CUDA paths for include and lib.
  mkCudaFlags = cudaPackages:
    let
      version = cudaPackages.cudaMajorVersion;
      # The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
      # combine everything together for headers to work.
      # It would be more convenient to use symlinkJoin on *just* the include subdirectories
      # of each package, but not all of them have an include directory and making that work
      # is more effort than it's worth for this temporary, build-time package.
      combined = symlinkJoin {
        name = "cuda-combined-${version}";
        paths = pkgSet;
      headers = symlinkJoin {
        name = "cuda-headers-combined-${version}";
        paths = lib.map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);
      };
      # The combined package above breaks the build for some reason so we just configure
      # each package's library path.
      libs = lib.concatMapStringsSep " " (x: ''"${x}/lib"'') pkgSet;
    in ''
      list(APPEND Cuda${version}_INCLUDE_PATHS "${combined}/include")
      list(APPEND Cuda${version}_LIB_PATHS ${libs})
    '';

# gcc11 is required by DCGM's very particular build system
# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
in gcc11Stdenv.mkDerivation rec {
    in [
      (lib.cmakeFeature "CUDA${version}_INCLUDE_DIR" "${headers}")
      (lib.cmakeFeature "CUDA${version}_LIBS" "${cudaPackages.cuda_cudart.stubs}/lib/stubs/libcuda.so")
      (lib.cmakeFeature "CUDA${version}_STATIC_LIBS" "${lib.getLib cudaPackages.cuda_cudart}/lib/libcudart.so")
      (lib.cmakeFeature "CUDA${version}_STATIC_CUBLAS_LIBS" (lib.concatStringsSep ";" [
        "${lib.getLib cudaPackages.libcublas}/lib/libcublas.so"
        "${lib.getLib cudaPackages.libcublas}/lib/libcublasLt.so"
      ]))
    ];
in stdenv.mkDerivation rec {
  pname = "dcgm";
  version = "3.3.5"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
  version = "3.3.9"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    rev = "refs/tags/v${version}";
    hash = "sha256-n/uWvgvxAGfr1X51XgtHfFGDOO5AMBSV5UWQQpsylpg=";
    hash = "sha256-PysxuN5WT7GB0oOvT5ezYeOau6AMVDDWE5HOAcmqw/Y=";
  };

  # Add our paths to the CUDA paths so FindCuda.cmake can find them.
  EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
  prePatch = ''
    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
  '';
  patches = [
    ./fix-includes.patch
    ./dynamic-libs.patch
  ];

  hardeningDisable = [ "all" ];

@@ -104,26 +85,55 @@ in gcc11Stdenv.mkDerivation rec {
    autoAddDriverRunpath

    cmake
    ninja
    git
    python3
  ];

  buildInputs = [
    # Header-only
    boost
    catch2
    plog.dev
    tclap_1_4

    # Dependencies that can be either static or dynamic.
    (fmt_9.override { enableShared = !static; }) # DCGM's build uses the static outputs regardless of enableShared
    (yaml-cpp.override { inherit static; stdenv = gcc11Stdenv; })

    # TODO: Dependencies that DCGM's CMake hard-codes to be static-only.
    (jsoncpp.override { enableStatic = true; })
    (libevent.override { sslSupport = false; static = true; })
    fmt_9
    yaml-cpp
    jsoncpp
    libevent
  ];

  disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;
  # Add our paths to the CMake flags so FindCuda.cmake can find them.
  cmakeFlags = lib.concatMap mkCudaFlags cudaPackageSets;

  # Lots of dodgy C++.
  env.NIX_CFLAGS_COMPILE = "-Wno-error";

  doCheck = true;

  checkPhase = ''
    runHook preCheck

    ctest -j $NIX_BUILD_CORES --output-on-failure --exclude-regex ${
      lib.escapeShellArg (
        lib.concatMapStringsSep "|" (test: "^${lib.escapeRegex test}$") [
          "DcgmModuleSysmon Watches"
          "DcgmModuleSysmon maxSampleAge"
          "DcgmModuleSysmon::CalculateCoreUtilization"
          "DcgmModuleSysmon::ParseProcStatCpuLine"
          "DcgmModuleSysmon::ParseThermalFileContentsAndStore"
          "DcgmModuleSysmon::PopulateTemperatureFileMap"
          "DcgmModuleSysmon::ReadCoreSpeed"
          "DcgmModuleSysmon::ReadTemperature"
          "Sysmon: initialize module"
        ]
      )
    }

    runHook postCheck
  '';

  disallowedReferences = lib.concatMap getCudaPackages cudaPackageSets;

  meta = with lib; {
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
Loading