Unverified commit 9077cf6d, authored by Gaétan Lepage and committed by GitHub
Browse files

Revert "nvidia-container-toolkit: wait for all device nodes to be present" (#463702)

parents a0a1272d c94dd0f6
Loading
Loading
Loading
Loading
+1 −38
Original line number Diff line number Diff line
@@ -289,6 +289,7 @@

      systemd.services.nvidia-container-toolkit-cdi-generator = {
        description = "Container Device Interface (CDI) for Nvidia generator";
        after = [ "systemd-udev-settle.service" ];
        requiredBy = lib.mkMerge [
          (lib.mkIf config.virtualisation.docker.enable [ "docker.service" ])
          (lib.mkIf config.virtualisation.podman.enable [ "podman.service" ])
@@ -297,44 +298,6 @@
        serviceConfig = {
          RuntimeDirectory = "cdi";
          RemainAfterExit = true;
          # Pre-start step: a shell script (built with pkgs.writeShellScript and
          # named "wait-for-nvidia-devices") that blocks until the NVIDIA device
          # nodes are present before the main ExecStart command runs.
          #
          # What the script does:
          #   1. If /proc/driver/nvidia/gpus does not exist, or contains no
          #      sub-directories, it logs a message and exits 0 (nothing to wait
          #      for).
          #   2. Otherwise the number of sub-directories directly under that path
          #      is taken as the expected GPU count.
          #   3. It then polls once per second, counting character devices located
          #      directly in /dev whose path matches '.*/nvidia[0-9]+'.
          #   4. As soon as that count equals the expected GPU count it exits 0.
          #   5. Once the elapsed counter reaches max_wait_seconds (60) without a
          #      match, it prints an error to stderr and exits 1.
          #
          # NOTE(review): the comparison in step 4 is a strict equality (-eq). If
          # /dev ever holds more nvidiaN nodes than there are entries in
          # /proc/driver/nvidia/gpus, the loop can only end via the timeout path
          # (exit 1). Confirm whether that situation can occur in practice.
          # NOTE(review): a non-zero exit from ExecStartPre is expected to fail the
          # unit (and therefore anything listed in requiredBy) - verify against the
          # systemd.service documentation.
          ExecStartPre = pkgs.writeShellScript "wait-for-nvidia-devices" ''
            set -eu

            gpus_dir="/proc/driver/nvidia/gpus"
            max_wait_seconds=60

            if [ ! -d "$gpus_dir" ]; then
              echo "wait-for-nvidia-devices: $gpus_dir does not exist; nothing to wait for."
              exit 0
            fi

            gpu_count=$(find "$gpus_dir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

            if [ "$gpu_count" -eq 0 ]; then
              echo "wait-for-nvidia-devices: no GPU entries found in $gpus_dir; nothing to wait for."
              exit 0
            fi

            echo "wait-for-nvidia-devices: expecting $gpu_count /dev/nvidiaN device node(s)."

            elapsed=0
            while true; do
              dev_count=$(find /dev -mindepth 1 -maxdepth 1 -type c -regex '.*/nvidia[0-9]+' 2>/dev/null | wc -l | tr -d ' ')

              if [ "$dev_count" -eq "$gpu_count" ]; then
                echo "wait-for-nvidia-devices: found $dev_count matching device node(s)."
                exit 0
              fi

              if [ "$elapsed" -ge "$max_wait_seconds" ]; then
                echo "wait-for-nvidia-devices: timed out after $max_wait_seconds seconds; expected $gpu_count node(s) but found $dev_count." >&2
                exit 1
              fi

              sleep 1
              elapsed=$((elapsed + 1))
            done
          '';
          ExecStart =
            let
              script = pkgs.callPackage ./cdi-generate.nix {