# ---------------------------------------------------------------------------
# src/pmake/machine.py
# (diff excerpt, hunks @@ -12,6 +12,7 @@ and @@ -60,19 +61,37 @@;
#  code between and after the hunks is not shown)
# ---------------------------------------------------------------------------

#   for R in Rs:
#       # free resources
#       M.free(R)
from typing import Tuple
import logging
import os
import re
import subprocess
import time

_tick = time.time


def tick():
    """Return the current wall-clock time (``time.time()``) in minutes."""
    return _tick()/60.0


# ... (lines between the two hunks are outside this excerpt) ...


class ResourceSet:
    # ... (earlier members of ResourceSet are outside this excerpt) ...

    def stop(self):
        # Record the stop timestamp using tick(), i.e. in minutes.
        self.stop_time = tick()


def parse_scontrol_show_node(s : str) -> Tuple[int, int]:
    """Parse the output of ``scontrol show node <name>``.

    Args:
        s: text printed by, e.g., ``scontrol show node defiant04``.

    Returns:
        ``(cpus, gpus)`` for the node: the ``CPUTot=`` value and the sum of
        the counts of all ``gpu`` entries in ``Gres=``.  A node without any
        ``gpu`` gres (``Gres=(null)`` or no ``Gres=`` key) reports 0 GPUs.

    Raises:
        ValueError: if no ``CPUTot=<number>`` field is present.
    """
    m = re.search(r"\bCPUTot=([0-9]+)", s)
    if m is None:
        # Explicit raise instead of assert: asserts vanish under ``python -O``.
        raise ValueError("CPUTot not found in scontrol show node output")
    ncpu = int(m[1])

    ngpu = 0
    m = re.search(r"\bGres=(\S*)", s)
    if m is not None:
        # Drop parenthesised annotations such as "(S:0-3)" and "(null)".
        gres = re.sub(r"\([^)]*\)", "", m[1])
        for entry in gres.split(","):
            name, *rest = entry.split(":")
            if name != "gpu":
                continue
            # Entries look like gpu:4 or gpu:ampere:2 -- the count is the
            # last field.  The type may itself contain digits (gpu:a100:4),
            # so it must not be scanned for the number.
            count = rest[-1] if rest else ""
            # NOTE(review): a gpu entry without a numeric count is taken to
            # mean one device -- confirm against the Slurm gres syntax.
            ngpu += int(count) if re.fullmatch(r"[0-9]+", count) else 1
    return ncpu, ngpu


def runcmd(*args):
    """Run a command and return its standard output as text.

    Args:
        *args: the program and its arguments, one word per argument, e.g.
            ``runcmd("scontrol", "show", "node", name)``.  A single
            list/tuple holding the words is accepted as well.

    Returns:
        The command's stdout decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    if len(args) == 1 and not isinstance(args[0], str):
        args = tuple(args[0])
    # Pass the words as ONE sequence.  Unpacking them into subprocess.run
    # would bind them to Popen's bufsize/executable/... parameters.
    ret = subprocess.run(list(args), capture_output=True, check=True,
                         encoding="utf-8")
    return ret.stdout


def machine(time):
    """Build the Machine description for the current batch environment.

    Args:
        time: forwarded unchanged as the first argument of ``Machine``.
            (The name shadows the ``time`` module inside this function.)
    """
    # Generic Slurm
    if 'SLURM_JOB_NUM_NODES' in os.environ:
        # Gather SLURM's hardware info from first node in allocation.
        node = runcmd("scontrol", "show", "hostname").split()[0]
        ans = runcmd("scontrol", "show", "node", node)
        ncpu, ngpu = parse_scontrol_show_node(ans)
        return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']),
                       ngpu, ncpu, "srun")

    # Summit
    # if 'LSB_MAX_NUM_PROCESSORS' in os.environ:
    #     ... (the remainder of machine() is outside this excerpt) ...


# ---------------------------------------------------------------------------
# tests/test_machine.py
# (diff excerpt, hunk @@ -34,5 +34,56 @@; the file's imports are not shown)
# ---------------------------------------------------------------------------

# The hunk opens inside TestMachine; only the tail of one of its methods is
# visible, so it is kept as a comment rather than reconstructed:
#     self.assertEqual(str(M.N), start)

# Sample `scontrol show node` text used by test_amd (whitespace as it appears
# in this excerpt).
defiant = (
    " NodeName=defiant04 Arch=x86_64 CoresPerSocket=16"
    " CPUAlloc=0 CPUEfctv=128 CPUTot=128 CPULoad=0.00"
    " AvailableFeatures=nvme"
    " ActiveFeatures=nvme"
    " Gres=gpu:4(S:0-3)"
    " NodeAddr=defiant04 NodeHostName=defiant04 Version=23.02.6"
    " OS=Linux 5.14.21-150400.24.69_12.0.85-cray_shasta_c #1 SMP Mon Jul 31 18:48:44 UTC 2023 (0febf3b)"
    " RealMemory=1 AllocMem=0 FreeMem=249540 Sockets=4 Boards=1"
    " State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"
    " Partitions=batch-gpu"
    " BootTime=2024-01-19T10:48:51 SlurmdStartTime=2024-01-19T10:51:10"
    " LastBusyTime=2024-01-22T12:23:10 ResumeAfterTime=None"
    " CfgTRES=cpu=128,mem=1M,billing=128"
    " AllocTRES="
    " CapWatts=n/a"
    " CurrentWatts=0 AveWatts=0"
    " ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s "
)

# Sample `scontrol show node` text used by test_nvidia.
wombat = (
    " NodeName=wombat29 Arch=aarch64 CoresPerSocket=80"
    " CPUAlloc=0 CPUTot=80 CPULoad=0.00"
    " AvailableFeatures=(null)"
    " ActiveFeatures=(null)"
    " Gres=gpu:ampere:2"
    " NodeAddr=172.30.140.153 NodeHostName=wombat29 Version=20.11.9"
    " OS=Linux 4.18.0-372.9.1.el8.aarch64 #1 SMP Fri Apr 15 22:01:11 EDT 2022"
    " RealMemory=522000 AllocMem=0 FreeMem=459606 Sockets=1 Boards=1"
    " State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"
    " Partitions=Ampere"
    " BootTime=2023-11-03T10:54:18 SlurmdStartTime=2023-11-03T10:55:12"
    " CfgTRES=cpu=80,mem=522000M,billing=80"
    " AllocTRES="
    " CapWatts=n/a"
    " CurrentWatts=0 AveWatts=0"
    " ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s"
    " Comment=(null) "
)


class TestSlurmParse(unittest.TestCase):
    """Check parse_scontrol_show_node against the two captured node records."""

    def test_nvidia(self):
        ncpu, ngpu = parse_scontrol_show_node(wombat)
        self.assertEqual(ncpu, 80)
        self.assertEqual(ngpu, 2)

    def test_amd(self):
        ncpu, ngpu = parse_scontrol_show_node(defiant)
        self.assertEqual(ncpu, 128)
        self.assertEqual(ngpu, 4)


if __name__ == '__main__':
    unittest.main()
# src/pmake/machine.py
# (diff excerpt, hunks @@ -12,6 +12,7 @@ and @@ -60,19 +61,37 @@;
#  code between and after the hunks is not shown)

#   for R in Rs:
#       # free resources
#       M.free(R)
from typing import Tuple
import logging
import os
import re
import subprocess
import time

_tick = time.time


def tick():
    """Return the current wall-clock time (``time.time()``) in minutes."""
    return _tick()/60.0


# ... (lines between the two hunks are outside this excerpt) ...


class ResourceSet:
    # ... (earlier members of ResourceSet are outside this excerpt) ...

    def stop(self):
        # Record the stop timestamp using tick(), i.e. in minutes.
        self.stop_time = tick()


def parse_scontrol_show_node(s : str) -> Tuple[int, int]:
    """Parse the output of ``scontrol show node <name>``.

    Args:
        s: text printed by, e.g., ``scontrol show node defiant04``.

    Returns:
        ``(cpus, gpus)`` for the node: the ``CPUTot=`` value and the sum of
        the counts of all ``gpu`` entries in ``Gres=``.  A node without any
        ``gpu`` gres (``Gres=(null)`` or no ``Gres=`` key) reports 0 GPUs.

    Raises:
        ValueError: if no ``CPUTot=<number>`` field is present.
    """
    m = re.search(r"\bCPUTot=([0-9]+)", s)
    if m is None:
        # Explicit raise instead of assert: asserts vanish under ``python -O``.
        raise ValueError("CPUTot not found in scontrol show node output")
    ncpu = int(m[1])

    ngpu = 0
    m = re.search(r"\bGres=(\S*)", s)
    if m is not None:
        # Drop parenthesised annotations such as "(S:0-3)" and "(null)".
        gres = re.sub(r"\([^)]*\)", "", m[1])
        for entry in gres.split(","):
            name, *rest = entry.split(":")
            if name != "gpu":
                continue
            # Entries look like gpu:4 or gpu:ampere:2 -- the count is the
            # last field.  The type may itself contain digits (gpu:a100:4),
            # so it must not be scanned for the number.
            count = rest[-1] if rest else ""
            # NOTE(review): a gpu entry without a numeric count is taken to
            # mean one device -- confirm against the Slurm gres syntax.
            ngpu += int(count) if re.fullmatch(r"[0-9]+", count) else 1
    return ncpu, ngpu


def runcmd(*args):
    """Run a command and return its standard output as text.

    Args:
        *args: the program and its arguments, one word per argument, e.g.
            ``runcmd("scontrol", "show", "node", name)``.  A single
            list/tuple holding the words is accepted as well.

    Returns:
        The command's stdout decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    if len(args) == 1 and not isinstance(args[0], str):
        args = tuple(args[0])
    # Pass the words as ONE sequence.  Unpacking them into subprocess.run
    # would bind them to Popen's bufsize/executable/... parameters.
    ret = subprocess.run(list(args), capture_output=True, check=True,
                         encoding="utf-8")
    return ret.stdout


def machine(time):
    """Build the Machine description for the current batch environment.

    Args:
        time: forwarded unchanged as the first argument of ``Machine``.
            (The name shadows the ``time`` module inside this function.)
    """
    # Generic Slurm
    if 'SLURM_JOB_NUM_NODES' in os.environ:
        # Gather SLURM's hardware info from first node in allocation.
        node = runcmd("scontrol", "show", "hostname").split()[0]
        ans = runcmd("scontrol", "show", "node", node)
        ncpu, ngpu = parse_scontrol_show_node(ans)
        return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']),
                       ngpu, ncpu, "srun")

    # Summit
    # if 'LSB_MAX_NUM_PROCESSORS' in os.environ:
    #     ... (the remainder of machine() is outside this excerpt) ...
# tests/test_machine.py
# (diff excerpt, hunk @@ -34,5 +34,56 @@; the file's imports are not shown)

# The hunk opens inside TestMachine; only the tail of one of its methods is
# visible, so it is kept as a comment rather than reconstructed:
#     self.assertEqual(str(M.N), start)

# Sample `scontrol show node` text used by test_amd (whitespace as it appears
# in this excerpt).
defiant = (
    " NodeName=defiant04 Arch=x86_64 CoresPerSocket=16"
    " CPUAlloc=0 CPUEfctv=128 CPUTot=128 CPULoad=0.00"
    " AvailableFeatures=nvme"
    " ActiveFeatures=nvme"
    " Gres=gpu:4(S:0-3)"
    " NodeAddr=defiant04 NodeHostName=defiant04 Version=23.02.6"
    " OS=Linux 5.14.21-150400.24.69_12.0.85-cray_shasta_c #1 SMP Mon Jul 31 18:48:44 UTC 2023 (0febf3b)"
    " RealMemory=1 AllocMem=0 FreeMem=249540 Sockets=4 Boards=1"
    " State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"
    " Partitions=batch-gpu"
    " BootTime=2024-01-19T10:48:51 SlurmdStartTime=2024-01-19T10:51:10"
    " LastBusyTime=2024-01-22T12:23:10 ResumeAfterTime=None"
    " CfgTRES=cpu=128,mem=1M,billing=128"
    " AllocTRES="
    " CapWatts=n/a"
    " CurrentWatts=0 AveWatts=0"
    " ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s "
)

# Sample `scontrol show node` text used by test_nvidia.
wombat = (
    " NodeName=wombat29 Arch=aarch64 CoresPerSocket=80"
    " CPUAlloc=0 CPUTot=80 CPULoad=0.00"
    " AvailableFeatures=(null)"
    " ActiveFeatures=(null)"
    " Gres=gpu:ampere:2"
    " NodeAddr=172.30.140.153 NodeHostName=wombat29 Version=20.11.9"
    " OS=Linux 4.18.0-372.9.1.el8.aarch64 #1 SMP Fri Apr 15 22:01:11 EDT 2022"
    " RealMemory=522000 AllocMem=0 FreeMem=459606 Sockets=1 Boards=1"
    " State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"
    " Partitions=Ampere"
    " BootTime=2023-11-03T10:54:18 SlurmdStartTime=2023-11-03T10:55:12"
    " CfgTRES=cpu=80,mem=522000M,billing=80"
    " AllocTRES="
    " CapWatts=n/a"
    " CurrentWatts=0 AveWatts=0"
    " ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s"
    " Comment=(null) "
)


class TestSlurmParse(unittest.TestCase):
    """Check parse_scontrol_show_node against the two captured node records."""

    def test_nvidia(self):
        # wombat29 reports CPUTot=80 and Gres=gpu:ampere:2
        cpus, gpus = parse_scontrol_show_node(wombat)
        self.assertEqual(cpus, 80)
        self.assertEqual(gpus, 2)

    def test_amd(self):
        # defiant04 reports CPUTot=128 and Gres=gpu:4(S:0-3)
        cpus, gpus = parse_scontrol_show_node(defiant)
        self.assertEqual(cpus, 128)
        self.assertEqual(gpus, 4)


if __name__ == '__main__':
    unittest.main()