dcgm: 3.3.5 -> 3.3.9; cudaPackages_10{,_0,_1,_2}: drop (#357655) (811c0af5) · Commits · nix / nixpkgs

nixos/doc/manual/release-notes/rl-2505.section.md

+2 −0

Original line number	Diff line number	Diff line
		@@ -28,6 +28,8 @@
		After you run ALTER EXTENSION, you must run [this SQL script](https://github.com/timescale/timescaledb-extras/blob/master/utils/2.15.X-fix_hypertable_foreign_keys.sql). For more details, see pull request [#6797](https://github.com/timescale/timescaledb/pull/6797).
		PostgreSQL 13 is no longer supported in TimescaleDB v2.16.

		- Support for CUDA 10 has been dropped, as announced in the 24.11 release notes.

		- `kanata` was updated to v1.7.0, which introduces several breaking changes.
		See the release notes of
		[v1.7.0](https://github.com/jtroo/kanata/releases/tag/v1.7.0)

pkgs/applications/science/math/caffe/default.nix

+2 −28

Original line number	Diff line number	Diff line
		@@ -14,23 +14,11 @@
		, Accelerate, CoreGraphics, CoreVideo
		, lmdbSupport ? true, lmdb
		, leveldbSupport ? true, leveldb, snappy
		, cudaSupport ? config.cudaSupport, cudaPackages ? { }
		, cudnnSupport ? cudaSupport
		, ncclSupport ? false
		, pythonSupport ? false, python ? null, numpy ? null
		, substituteAll
		}:

		let
		inherit (cudaPackages) backendStdenv cudatoolkit nccl;
		# The default for cudatoolkit 10.1 is CUDNN 8.0.5, the last version to support CUDA 10.1.
		# However, this caffe does not build with CUDNN 8.x, so we use CUDNN 7.6.5 instead.
		# Earlier versions of cudatoolkit use pre-8.x CUDNN, so we use the default.
		hasCudnn =
		if lib.versionOlder cudatoolkit.version "10.1"
		then cudaPackages ? cudnn
		else cudaPackages ? cudnn_7_6;

		toggle = bool: if bool then "ON" else "OFF";

		test_model_weights = fetchurl {
		@@ -57,20 +45,12 @@ stdenv.mkDerivation rec {
		# boost_python expects
		[ (if pythonSupport then "-Dpython_version=${python.pythonVersion}" else "-DBUILD_python=OFF")
		"-DBLAS=open"
		] ++ (if cudaSupport then [
		"-DCUDA_ARCH_NAME=All"
		"-DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin/cc"
		] else [ "-DCPU_ONLY=ON" ])
		++ ["-DUSE_NCCL=${toggle ncclSupport}"]
		++ ["-DUSE_LEVELDB=${toggle leveldbSupport}"]
		"-DCPU_ONLY=ON"
		] ++ ["-DUSE_LEVELDB=${toggle leveldbSupport}"]
		++ ["-DUSE_LMDB=${toggle lmdbSupport}"];

		buildInputs = [ boost gflags glog protobuf hdf5-cpp opencv4 blas ]
		++ lib.optional cudaSupport cudatoolkit
		++ lib.optional (lib.versionOlder cudatoolkit.version "10.1" && hasCudnn) cudaPackages.cudnn
		++ lib.optional (lib.versionAtLeast cudatoolkit.version "10.1" && hasCudnn) cudaPackages.cudnn_7_6
		++ lib.optional lmdbSupport lmdb
		++ lib.optional ncclSupport nccl
		++ lib.optionals leveldbSupport [ leveldb snappy ]
		++ lib.optionals pythonSupport [ python numpy ]
		++ lib.optionals stdenv.hostPlatform.isDarwin [ Accelerate CoreGraphics CoreVideo ]
		@@ -105,9 +85,6 @@ stdenv.mkDerivation rec {
		substituteInPlace src/caffe/util/io.cpp --replace \
		'SetTotalBytesLimit(kProtoReadBytesLimit, 536870912)' \
		'SetTotalBytesLimit(kProtoReadBytesLimit)'
		'' + lib.optionalString (cudaSupport && lib.versionAtLeast cudatoolkit.version "9.0") ''
		# CUDA 9.0 doesn't support sm_20
		sed -i 's,20 21(20) ,,' cmake/Cuda.cmake
		'';

		preConfigure = lib.optionalString pythonSupport ''
		@@ -150,10 +127,7 @@ stdenv.mkDerivation rec {
		maintainers = [ ];
		broken =
		(pythonSupport && (python.isPy310))
		|| cudaSupport
		|| !(leveldbSupport -> (leveldb != null && snappy != null))
		|| !(cudnnSupport -> (hasCudnn && cudaSupport))
		|| !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported))
		|| !(pythonSupport -> (python != null && numpy != null))
		;
		license = licenses.bsd2;

pkgs/by-name/dc/dcgm/dynamic-libs.patch

0 → 100644

+40 −0

Original line number	Diff line number	Diff line
		diff --git a/cmake/FindJsoncpp.cmake b/cmake/FindJsoncpp.cmake
		index abedf7bb85..d7a52f5c7b 100644
		--- a/cmake/FindJsoncpp.cmake
		+++ b/cmake/FindJsoncpp.cmake
		@@ -16,8 +16,8 @@

		if (NOT TARGET JsonCpp::JsonCpp)
		find_package(jsoncpp REQUIRED CONFIG)
		- set(JSONCPP_STATIC_LIBS jsoncpp_static)
		- set(JSONCPP_INCLUDE_PATH $<TARGET_PROPERTY:jsoncpp_static,INTERFACE_INCLUDE_DIRECTORIES>)
		+ set(JSONCPP_STATIC_LIBS jsoncpp)
		+ set(JSONCPP_INCLUDE_PATH $<TARGET_PROPERTY:jsoncpp,INTERFACE_INCLUDE_DIRECTORIES>)
		endif()
		# set(Jsoncpp_PATH_PREFIXES /usr/local "${Jsoncpp_ROOT}" "$ENV{HOME}")
		# foreach(prefix ${Jsoncpp_PATH_PREFIXES})
		diff --git a/cmake/FindLibevent.cmake b/cmake/FindLibevent.cmake
		index 354d6f9b99..38aca9534e 100644
		--- a/cmake/FindLibevent.cmake
		+++ b/cmake/FindLibevent.cmake
		@@ -26,16 +26,12 @@
		endforeach()

		find_path(LIBEVENT_INCLUDE_DIR evhttp.h event.h PATHS ${Libevent_INCLUDE_PATHS})
		-find_library(LIBEVENT_STATIC_LIB NAMES libevent.a libevent_core.a libevent_extra.a PATHS ${Libevent_LIB_PATHS})
		-find_library(LIBEVENT_PTHREAD_STATIC_LIB NAMES libevent_pthreads.a PATHS ${Libevent_LIB_PATHS})
		+find_library(LIBEVENT_STATIC_LIB NAMES libevent.so libevent_core.so libevent_extra.so PATHS ${Libevent_LIB_PATHS})
		+find_library(LIBEVENT_PTHREAD_STATIC_LIB NAMES libevent_pthreads.so PATHS ${Libevent_LIB_PATHS})

		-if (LIBEVENT_INCLUDE_DIR AND LIBEVENT_STATIC_LIB AND LIBEVENT_PTHREAD_STATIC_LIB)
		+if (LIBEVENT_INCLUDE_DIR AND LIBEVENT_STATIC_LIB)
		set(Libevent_FOUND TRUE)
		- add_library(libevent_event_static STATIC IMPORTED)
		- set_target_properties(libevent_event_static PROPERTIES IMPORTED_LOCATION ${LIBEVENT_STATIC_LIB})
		- add_library(libevent_event_pthread STATIC IMPORTED)
		- set_target_properties(libevent_event_pthread PROPERTIES IMPORTED_LOCATION ${LIBEVENT_PTHREAD_STATIC_LIB})
		- set(LIBEVENT_STATIC_LIBS libevent_event_static libevent_event_pthread)
		+ set(LIBEVENT_STATIC_LIBS ${LIBEVENT_STATIC_LIB} ${LIBEVENT_PTHREAD_STATIC_LIB})
		else ()
		set(Libevent_FOUND FALSE)
		endif ()

pkgs/by-name/dc/dcgm/fix-includes.patch

0 → 100644

+110 −0

Original line number	Diff line number	Diff line
		diff --git a/common/CudaWorker/DcgmDgemm.cpp b/common/CudaWorker/DcgmDgemm.cpp
		index 8d33a3256e..6b3284258d 100644
		--- a/common/CudaWorker/DcgmDgemm.cpp
		+++ b/common/CudaWorker/DcgmDgemm.cpp
		@@ -17,6 +17,7 @@

		#include <exception>
		#include <stdexcept>
		+#include <cinttypes>

		#define CU_CHK(op) \
		if (auto const status = op; status != CUBLAS_STATUS_SUCCESS) \
		@@ -122,4 +123,4 @@
		return CUBLAS_STATUS_SUCCESS;
		}

		-} // namespace DcgmNs
		\ No newline at end of file
		+} // namespace DcgmNs
		diff --git a/common/DcgmError.h b/common/DcgmError.h
		index 8638cdceb1..e8d817c0d4 100644
		--- a/common/DcgmError.h
		+++ b/common/DcgmError.h
		@@ -17,6 +17,7 @@

		#include <sstream>
		#include <string>
		+#include <cinttypes>

		#include <dcgm_agent.h>
		#include <dcgm_errors.h>
		diff --git a/common/DcgmStringHelpers.cpp b/common/DcgmStringHelpers.cpp
		index b41917e3b7..1fe63980c7 100644
		--- a/common/DcgmStringHelpers.cpp
		+++ b/common/DcgmStringHelpers.cpp
		@@ -17,6 +17,7 @@

		#include <cstring>
		#include <string>
		+#include <algorithm>

		/*****************************************************************************/
		void dcgmTokenizeString(const std::string &src, const std::string &delimiter, std::vector<std::string> &tokens)
		diff --git a/dcgmi/CommandOutputController.cpp b/dcgmi/CommandOutputController.cpp
		index 5057205564..8520171efa 100644
		--- a/dcgmi/CommandOutputController.cpp
		+++ b/dcgmi/CommandOutputController.cpp
		@@ -24,6 +24,7 @@
		#include "dcgm_agent.h"
		#include <DcgmStringHelpers.h>
		#include <algorithm>
		+#include <functional>
		#include <cstdarg>
		#include <iostream>
		#include <string>
		diff --git a/dcgmi/Diag.h b/dcgmi/Diag.h
		index a326f7b949..563fb3c9c0 100755
		--- a/dcgmi/Diag.h
		+++ b/dcgmi/Diag.h
		@@ -24,6 +24,7 @@
		#define DIAG_H_

		#include <optional>
		+#include <functional>

		#include "Command.h"
		#include "CommandOutputController.h"
		diff --git a/hostengine/src/HostEngineOutput.cpp b/hostengine/src/HostEngineOutput.cpp
		index 23c6ca9f54..798b83b3e4 100644
		--- a/hostengine/src/HostEngineOutput.cpp
		+++ b/hostengine/src/HostEngineOutput.cpp
		@@ -20,6 +20,7 @@
		#include <iostream>
		#include <limits>
		#include <string_view>
		+#include <unordered_map>

		namespace
		{
		@@ -365,4 +366,4 @@
		}
		}
		os << std::endl;
		-}
		\ No newline at end of file
		+}
		diff --git a/nvvs/src/NvvsCommon.cpp b/nvvs/src/NvvsCommon.cpp
		index 8f7888649b..1604d9dabe 100644
		--- a/nvvs/src/NvvsCommon.cpp
		+++ b/nvvs/src/NvvsCommon.cpp
		@@ -15,6 +15,7 @@
		*/
		#include <sstream>
		#include <stdexcept>
		+#include <algorithm>
		#include <sys/stat.h>
		#include <sys/types.h>

		diff --git a/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp b/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp
		index 9eebeaf1c4..6e21201229 100644
		--- a/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp
		+++ b/sdk/nvidia/nvml/nvml_loader/nvml_loader.cpp
		@@ -20,6 +20,7 @@

		#include <atomic>
		#include <mutex>
		+#include <cstdlib>

		static void *g_nvmlLib = 0;
		static std::atomic_uint32_t g_nvmlStaticLibResetHooksCount = 1;

pkgs/by-name/dc/dcgm/package.nix

+68 −58

Original line number	Diff line number	Diff line
		{ lib
		, gcc11Stdenv
		, stdenv
		, fetchFromGitHub
		, autoAddDriverRunpath
		, catch2
		, cmake
		, cudaPackages_10_2
		, ninja
		, cudaPackages_11_8
		, cudaPackages_12
		, boost
		, fmt_9
		, git
		, jsoncpp
		@@ -16,30 +17,14 @@
		, symlinkJoin
		, tclap_1_4
		, yaml-cpp

		, static ? gcc11Stdenv.hostPlatform.isStatic
		}:
		let
		# DCGM depends on 3 different versions of CUDA at the same time.
		# The runtime closure, thankfully, is quite small because most things
		# are statically linked.
		cudaPackageSetByVersion = [
		{
		version = "10";
		# Nixpkgs cudaPackages_10 doesn't have redist packages broken out.
		pkgSet = [
		cudaPackages_10_2.cudatoolkit
		cudaPackages_10_2.cudatoolkit.lib
		];
		}
		{
		version = "11";
		pkgSet = getCudaPackages cudaPackages_11_8;
		}
		{
		version = "12";
		pkgSet = getCudaPackages cudaPackages_12;
		}
		# DCGM depends on 2 different versions of CUDA at the same time.
		# The runtime closure, thankfully, is quite small as it does not
		# include the CUDA libraries.
		cudaPackageSets = [
		cudaPackages_11_8
		cudaPackages_12
		];

		# Select needed redist packages from cudaPackages
		@@ -54,44 +39,40 @@ let
		libcurand
		];

		# Builds CMake code to add CUDA paths for include and lib.
		mkAppendCudaPaths = { version, pkgSet }:
		# Builds CMake flags to add CUDA paths for include and lib.
		mkCudaFlags = cudaPackages:
		let
		version = cudaPackages.cudaMajorVersion;
		# The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
		# combine everything together for headers to work.
		# It would be more convenient to use symlinkJoin on just the include subdirectories
		# of each package, but not all of them have an include directory and making that work
		# is more effort than it's worth for this temporary, build-time package.
		combined = symlinkJoin {
		name = "cuda-combined-${version}";
		paths = pkgSet;
		headers = symlinkJoin {
		name = "cuda-headers-combined-${version}";
		paths = lib.map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);
		};
		# The combined package above breaks the build for some reason so we just configure
		# each package's library path.
		libs = lib.concatMapStringsSep " " (x: ''"${x}/lib"'') pkgSet;
		in ''
		list(APPEND Cuda${version}_INCLUDE_PATHS "${combined}/include")
		list(APPEND Cuda${version}_LIB_PATHS ${libs})
		'';

		# gcc11 is required by DCGM's very particular build system
		# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
		in gcc11Stdenv.mkDerivation rec {
		in [
		(lib.cmakeFeature "CUDA${version}_INCLUDE_DIR" "${headers}")
		(lib.cmakeFeature "CUDA${version}_LIBS" "${cudaPackages.cuda_cudart.stubs}/lib/stubs/libcuda.so")
		(lib.cmakeFeature "CUDA${version}_STATIC_LIBS" "${lib.getLib cudaPackages.cuda_cudart}/lib/libcudart.so")
		(lib.cmakeFeature "CUDA${version}_STATIC_CUBLAS_LIBS" (lib.concatStringsSep ";" [
		"${lib.getLib cudaPackages.libcublas}/lib/libcublas.so"
		"${lib.getLib cudaPackages.libcublas}/lib/libcublasLt.so"
		]))
		];
		in stdenv.mkDerivation rec {
		pname = "dcgm";
		version = "3.3.5"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
		version = "3.3.9"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.

		src = fetchFromGitHub {
		owner = "NVIDIA";
		repo = "DCGM";
		rev = "refs/tags/v${version}";
		hash = "sha256-n/uWvgvxAGfr1X51XgtHfFGDOO5AMBSV5UWQQpsylpg=";
		hash = "sha256-PysxuN5WT7GB0oOvT5ezYeOau6AMVDDWE5HOAcmqw/Y=";
		};

		# Add our paths to the CUDA paths so FindCuda.cmake can find them.
		EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
		prePatch = ''
		echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
		'';
		patches = [
		./fix-includes.patch
		./dynamic-libs.patch
		];

		hardeningDisable = [ "all" ];

		@@ -104,26 +85,55 @@ in gcc11Stdenv.mkDerivation rec {
		autoAddDriverRunpath

		cmake
		ninja
		git
		python3
		];

		buildInputs = [
		# Header-only
		boost
		catch2
		plog.dev
		tclap_1_4

		# Dependencies that can be either static or dynamic.
		(fmt_9.override { enableShared = !static; }) # DCGM's build uses the static outputs regardless of enableShared
		(yaml-cpp.override { inherit static; stdenv = gcc11Stdenv; })

		# TODO: Dependencies that DCGM's CMake hard-codes to be static-only.
		(jsoncpp.override { enableStatic = true; })
		(libevent.override { sslSupport = false; static = true; })
		fmt_9
		yaml-cpp
		jsoncpp
		libevent
		];

		disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;
		# Add our paths to the CMake flags so FindCuda.cmake can find them.
		cmakeFlags = lib.concatMap mkCudaFlags cudaPackageSets;

		# Lots of dodgy C++.
		env.NIX_CFLAGS_COMPILE = "-Wno-error";

		doCheck = true;

		checkPhase = ''
		runHook preCheck

		ctest -j $NIX_BUILD_CORES --output-on-failure --exclude-regex ${
		lib.escapeShellArg (
		lib.concatMapStringsSep "|" (test: "^${lib.escapeRegex test}$") [
		"DcgmModuleSysmon Watches"
		"DcgmModuleSysmon maxSampleAge"
		"DcgmModuleSysmon::CalculateCoreUtilization"
		"DcgmModuleSysmon::ParseProcStatCpuLine"
		"DcgmModuleSysmon::ParseThermalFileContentsAndStore"
		"DcgmModuleSysmon::PopulateTemperatureFileMap"
		"DcgmModuleSysmon::ReadCoreSpeed"
		"DcgmModuleSysmon::ReadTemperature"
		"Sysmon: initialize module"
		]
		)
		}

		runHook postCheck
		'';

		disallowedReferences = lib.concatMap getCudaPackages cudaPackageSets;

		meta = with lib; {
		description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";