Commit 4fa6dae5 authored by David M. Rogers's avatar David M. Rogers
Browse files

Portable SLURM ncpu/ngpu detection.

parent 166f778f
Loading
Loading
Loading
Loading
+28 −9
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#   for R in Rs: # free resources
#      M.free(R)

from typing import Tuple
import os, re, time, logging
_tick = time.time


def tick():
    """Return the current time in minutes.

    The raw reading is taken through the module-level ``_tick`` alias
    (bound to ``time.time``) and converted from seconds to minutes.
    """
    seconds = _tick()
    return seconds / 60.0
@@ -60,19 +61,37 @@ class ResourceSet:
    def stop(self):
        """Record the current ``tick()`` reading as ``self.stop_time``."""
        now = tick()
        self.stop_time = now

"""
Parse output of:

$ scontrol show node defiant04

to return (cpus, gpus) per node.
"""

def parse_scontrol_show_node(s : str) -> Tuple[int, int]:
    """Extract per-node CPU and GPU counts from ``scontrol show node`` output.

    Args:
        s: Text printed by ``scontrol show node <nodename>`` for one node.

    Returns:
        ``(ncpu, ngpu)``.  ``ncpu`` is the ``CPUTot`` value.  ``ngpu`` is the
        sum of the counts of every ``gpu`` entry on the ``Gres=`` line, and
        is 0 when the node lists no gpu entry (e.g. ``Gres=(null)``).

    Raises:
        ValueError: if ``CPUTot`` is missing, or a ``gpu`` Gres entry does
            not end in a numeric count.
    """
    m = re.search(r"CPUTot=([0-9]+)", s)
    if m is None:
        raise ValueError("CPUTot not found in scontrol show node output")
    ncpu = int(m[1])

    ngpu = 0
    # \b keeps this from matching inside a longer key; "GresUsed=" does not
    # contain the literal "Gres=" so it is never picked up either.
    m = re.search(r"\bGres=(\S+)", s)
    if m is not None:
        # Drop annotations such as "(S:0-3)" first: they contain ':' (and
        # possibly ',') that would otherwise confuse the field splitting.
        gres = re.sub(r"\([^)]*\)", "", m[1])
        for entry in gres.split(","):
            fields = entry.split(":")
            if fields[0] != "gpu":
                continue
            # The count is always the LAST field.  Type names may contain
            # digits themselves ("gpu:a100:4"), so the count must not be
            # taken from the first run of digits.
            count = fields[-1]
            if len(fields) < 2 or re.fullmatch(r"[0-9]+", count) is None:
                raise ValueError(
                    "Unable to parse gpu count from Gres entry %r" % entry)
            ngpu += int(count)
    return ncpu, ngpu

def runcmd(*args):
    """Run an external command and return its standard output as text.

    Args:
        *args: The program name followed by its arguments, one string each
            (e.g. ``runcmd("scontrol", "show", "hostname")``).

    Returns:
        The command's stdout decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
        FileNotFoundError: if the program cannot be found.
    """
    # Function-scope import keeps this helper self-contained.
    import subprocess

    # Pass the argv as ONE sequence.  Unpacking it (``run(*args)``) would
    # spread the words over Popen's positional parameters (bufsize,
    # executable, ...) and fail for any command with arguments.
    ret = subprocess.run(args, capture_output=True, check=True,
                         encoding="utf-8")
    return ret.stdout

def machine(time):
    # Generic Slurm
    if 'SLURM_JOB_NUM_NODES' in os.environ:
        # FIXME: determine HT count
        # "grep 'physical id' /proc/cpuinfo | sort -u | wc -l"
        #import multiprocessing as m
        #cpu = m.cpu_count() // 2
        scpu = os.environ['SLURM_JOB_CPUS_PER_NODE']
        m = re.match(r'[0-9]+', scpu)
        assert m is not None, "Error parsing SLURM_JOB_CPUS_PER_NODE=%s"%scpu
        cpu = int(m[0])
        # Gather SLURM's hardware info from first node in allocation.
        node = runcmd("scontrol", "show", "hostname").split()[0]
        ans = runcmd("scontrol", "show", "node", node)
        ncpu, ngpu = parse_scontrol_show_node(ans)

        return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']),
                       0, cpu, "srun")
                       ngpu, ncpu, "srun")

    # Summit
    if 'LSB_MAX_NUM_PROCESSORS' in os.environ:
+51 −0
Original line number Diff line number Diff line
@@ -34,5 +34,56 @@ class TestMachine(unittest.TestCase):

        self.assertEqual(str(M.N), start)

# Sample `scontrol show node` output for node defiant04.  Its Gres line gives
# a bare gpu count followed by a parenthesised annotation ("gpu:4(S:0-3)").
# Parsed by TestSlurmParse.test_amd.
defiant = """
   NodeName=defiant04 Arch=x86_64 CoresPerSocket=16
   CPUAlloc=0 CPUEfctv=128 CPUTot=128 CPULoad=0.00
   AvailableFeatures=nvme
   ActiveFeatures=nvme
   Gres=gpu:4(S:0-3)
   NodeAddr=defiant04 NodeHostName=defiant04 Version=23.02.6
   OS=Linux 5.14.21-150400.24.69_12.0.85-cray_shasta_c #1 SMP Mon Jul 31 18:48:44 UTC 2023 (0febf3b)
   RealMemory=1 AllocMem=0 FreeMem=249540 Sockets=4 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=batch-gpu
   BootTime=2024-01-19T10:48:51 SlurmdStartTime=2024-01-19T10:51:10
   LastBusyTime=2024-01-22T12:23:10 ResumeAfterTime=None
   CfgTRES=cpu=128,mem=1M,billing=128
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
"""

# Sample `scontrol show node` output for node wombat29.  Its Gres line names
# a gpu type before the count ("gpu:ampere:2").
# Parsed by TestSlurmParse.test_nvidia.
wombat = """
   NodeName=wombat29 Arch=aarch64 CoresPerSocket=80
   CPUAlloc=0 CPUTot=80 CPULoad=0.00
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:ampere:2
   NodeAddr=172.30.140.153 NodeHostName=wombat29 Version=20.11.9
   OS=Linux 4.18.0-372.9.1.el8.aarch64 #1 SMP Fri Apr 15 22:01:11 EDT 2022
   RealMemory=522000 AllocMem=0 FreeMem=459606 Sockets=1 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=Ampere
   BootTime=2023-11-03T10:54:18 SlurmdStartTime=2023-11-03T10:55:12
   CfgTRES=cpu=80,mem=522000M,billing=80
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)
"""

class TestSlurmParse(unittest.TestCase):
    """Check parse_scontrol_show_node against captured scontrol output."""

    def test_nvidia(self):
        """The wombat sample reports 80 CPUs and 2 GPUs."""
        cpus, gpus = parse_scontrol_show_node(wombat)
        self.assertEqual((cpus, gpus), (80, 2))

    def test_amd(self):
        """The defiant sample reports 128 CPUs and 4 GPUs."""
        cpus, gpus = parse_scontrol_show_node(defiant)
        self.assertEqual((cpus, gpus), (128, 4))

# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()