Commit 47169462 authored by Rogers, David's avatar Rogers, David
Browse files

Merge branch 'slurm_resources' into 'master'

Slurm resources

See merge request !3
parents 166f778f ed96fa77
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
# Rule `rungpu`: run the hello_jobstep binary and capture its output in rungpu.log.
rungpu:
  # Resources requested for one execution of this rule.
  resource:
      time: 1.0 # presumably minutes -- TODO confirm the scheduler's time unit
      nrs: 1 # 1 resource set
      cpu: 14 # CPUs per resource set
      gpu: 2 # GPUs per resource set
      srun_attr: "--gpu-bind=closest" # extra flags appended to the srun command line
  inp: [] # no input files
  out:
    - rungpu.log
  # NOTE(review): {mpirun} is presumably replaced by the launcher command
  # (e.g. the srun line) before the script runs -- confirm.
  script: |
    {mpirun} ../hello_jobstep/hello_jobstep >rungpu.log
    sleep 20
+52 −0
Original line number Diff line number Diff line
# This example defines ten independent targets (run1 .. run10).
# Each one runs in its own directory and produces rungpu.log there.
run1:
  dirname: run1
  out:
      - rungpu.log

run2:
  dirname: run2
  out:
      - rungpu.log

run3:
  dirname: run3
  out:
      - rungpu.log

run4:
  dirname: run4
  out:
      - rungpu.log

run5:
  dirname: run5
  out:
      - rungpu.log

run6:
  dirname: run6
  out:
      - rungpu.log

run7:
  dirname: run7
  out:
      - rungpu.log

run8:
  dirname: run8
  out:
      - rungpu.log

run9:
  dirname: run9
  out:
      - rungpu.log

run10:
  dirname: run10
  out:
      - rungpu.log
+22 −0
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH -A stf006
#SBATCH -t 0:10:00
#SBATCH -N 2
#SBATCH -J gpu_test
#SBATCH -o gpu_test.%J

# Batch job that exercises GPU resource allocation: it creates ten run
# directories and lets pmake schedule the `rungpu` rule in each of them.
#
# Time limit: Slurm reads a two-field value as minutes:seconds, so "0:10"
# would request ten seconds.  The three-field form above requests ten minutes.

### Build
# git clone https://code.ornl.gov/olcf/hello_jobstep.git
# cd hello_jobstep
# module load craype-accel-amd-gfx90a rocm
# make

### Load the python env
# module load cray-python
# python3 -m venv venv
# . venv/bin/activate
# (cd ../../ && pip install -e .)

# One working directory per target (run1 .. run10).
for((i=1;i<=10;i++)); do mkdir -p run$i; done
# Remove stale logs so every target is rebuilt; -f keeps the first run
# (when no logs exist yet) from printing an error.
rm -f run[0-9]*/rungpu.log
# NOTE(review): the trailing 10 is presumably the time budget in minutes and
# should agree with the -t limit above -- confirm against pmake's usage.
pmake rules.yaml targets.yaml 10
+34 −11
Original line number Diff line number Diff line
@@ -12,6 +12,8 @@
#   for R in Rs: # free resources
#      M.free(R)

from typing import Tuple
import subprocess
import os, re, time, logging
_tick = time.time
tick = lambda: _tick()/60.0
@@ -48,9 +50,11 @@ class ResourceSet:
    def min_nodes(self, CpN, GpN):
        """Return the fewest nodes able to hold all `nrs` resource sets.

        CpN and GpN are the CPUs and GPUs available on one node.
        Returns None when a single resource set cannot fit on a node
        (or when the cpu/gpu request itself is out of range).
        """
        fits_on_node = (1 <= self.cpu <= CpN) and (0 <= self.gpu <= GpN)
        if not fits_on_node:
            return None

        # How many resource sets one node can host, limited first by
        # CPUs and then (only if GPUs are requested) by GPUs.
        sets_per_node = CpN // self.cpu
        if self.gpu > 0:
            sets_per_node = min(sets_per_node, GpN // self.gpu)

        # Ceiling division: nrs / sets_per_node, rounded up.
        return (self.nrs + sets_per_node - 1) // sets_per_node

    def ready(self):
@@ -60,19 +64,37 @@ class ResourceSet:
    def stop(self):
        self.stop_time = tick()

def parse_scontrol_show_node(s : str) -> Tuple[int, int]:
    """Parse the output of `scontrol show node <name>`.

    Example command:

        $ scontrol show node defiant04

    The CPU count is taken from the `CPUTot=` field.  The GPU count is
    taken from the `Gres=` field, whose comma-separated entries look like
    `gpu:4(S:0-3)`, `gpu:ampere:2` or `gpu:a100:4`.  Only the trailing
    numeric field of each `gpu` entry is used as its count, so digits that
    belong to a GPU type name (a100, v100, mi250x, ...) are never mistaken
    for the count.  Counts of several `gpu` entries are summed.  An entry
    without a count contributes 1.

    Args:
        s: text printed by `scontrol show node`.

    Returns:
        (cpus, gpus) per node.  gpus is 0 when the node lists no gpu
        Gres (e.g. `Gres=(null)`).

    Raises:
        AssertionError: if `CPUTot=<number>` is not present.
    """
    m = re.search(r"CPUTot=([0-9]+)", s)
    assert m is not None, "CPUTot not found in scontrol show node output"
    ncpu = int(m[1])

    ngpu = 0
    # \b keeps this from matching inside other fields such as "GresUsed=".
    m = re.search(r"\bGres=(\S*)", s)
    if m is not None:
        # Drop parenthesised annotations like "(S:0-3)" or "(null)" first,
        # since they may contain ':' or ',' themselves.
        gres = re.sub(r"\([^)]*\)", "", m[1])
        for entry in gres.split(","):
            fields = entry.split(":")
            if fields[0] != "gpu":
                continue
            count = fields[-1]
            if len(fields) > 1 and re.fullmatch(r"[0-9]+", count):
                ngpu += int(count)
            else:
                # "gpu" or "gpu:<type>" with no explicit count
                ngpu += 1
    return ncpu, ngpu

def runcmd(*args):
    """Run a command and return everything it wrote to stdout, as text.

    The positional arguments form the argv list (no shell is involved).
    Both output streams are captured and decoded as UTF-8.  A non-zero
    exit status raises subprocess.CalledProcessError.
    """
    completed = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
        encoding="utf-8",
        universal_newlines=True,
    )
    return completed.stdout

def machine(time):
    # Generic Slurm
    if 'SLURM_JOB_NUM_NODES' in os.environ:
        # FIXME: determine HT count
        # "grep 'physical id' /proc/cpuinfo | sort -u | wc -l"
        #import multiprocessing as m
        #cpu = m.cpu_count() // 2
        scpu = os.environ['SLURM_JOB_CPUS_PER_NODE']
        m = re.match(r'[0-9]+', scpu)
        assert m is not None, "Error parsing SLURM_JOB_CPUS_PER_NODE=%s"%scpu
        cpu = int(m[0])
        # Gather SLURM's hardware info from first node in allocation.
        node = runcmd("scontrol", "show", "hostname").split()[0]
        ans = runcmd("scontrol", "show", "node", node)
        ncpu, ngpu = parse_scontrol_show_node(ans)

        return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']),
                       0, cpu, "srun")
                       ngpu, ncpu, "srun")

    # Summit
    if 'LSB_MAX_NUM_PROCESSORS' in os.environ:
@@ -99,7 +121,8 @@ class Machine:
        if self.batch == 'srun':
            nodes = R.min_nodes(self.CpN, self.GpN)
            assert nodes is not None
            srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % (
            #srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % (
            srun = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % (
                         nodes, R.nrs*R.tasks, R.nrs*R.cpu//nodes, R.nrs*R.gpu)
            if R.srun_attr is not None:
                srun += " %s" % R.srun_attr
+51 −0
Original line number Diff line number Diff line
@@ -34,5 +34,56 @@ class TestMachine(unittest.TestCase):

        self.assertEqual(str(M.N), start)

defiant = """
   NodeName=defiant04 Arch=x86_64 CoresPerSocket=16
   CPUAlloc=0 CPUEfctv=128 CPUTot=128 CPULoad=0.00
   AvailableFeatures=nvme
   ActiveFeatures=nvme
   Gres=gpu:4(S:0-3)
   NodeAddr=defiant04 NodeHostName=defiant04 Version=23.02.6
   OS=Linux 5.14.21-150400.24.69_12.0.85-cray_shasta_c #1 SMP Mon Jul 31 18:48:44 UTC 2023 (0febf3b)
   RealMemory=1 AllocMem=0 FreeMem=249540 Sockets=4 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=batch-gpu
   BootTime=2024-01-19T10:48:51 SlurmdStartTime=2024-01-19T10:51:10
   LastBusyTime=2024-01-22T12:23:10 ResumeAfterTime=None
   CfgTRES=cpu=128,mem=1M,billing=128
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
"""

wombat = """
   NodeName=wombat29 Arch=aarch64 CoresPerSocket=80
   CPUAlloc=0 CPUTot=80 CPULoad=0.00
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:ampere:2
   NodeAddr=172.30.140.153 NodeHostName=wombat29 Version=20.11.9
   OS=Linux 4.18.0-372.9.1.el8.aarch64 #1 SMP Fri Apr 15 22:01:11 EDT 2022
   RealMemory=522000 AllocMem=0 FreeMem=459606 Sockets=1 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=Ampere
   BootTime=2023-11-03T10:54:18 SlurmdStartTime=2023-11-03T10:55:12
   CfgTRES=cpu=80,mem=522000M,billing=80
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)
"""

class TestSlurmParse(unittest.TestCase):
    """Check (cpus, gpus) extraction from captured `scontrol show node` text."""

    def test_nvidia(self):
        # wombat lists its GPUs with a type name: "Gres=gpu:ampere:2"
        cpus, gpus = parse_scontrol_show_node(wombat)
        self.assertEqual((cpus, gpus), (80, 2))

    def test_amd(self):
        # defiant lists a bare count with a socket suffix: "Gres=gpu:4(S:0-3)"
        cpus, gpus = parse_scontrol_show_node(defiant)
        self.assertEqual((cpus, gpus), (128, 4))

if __name__ == '__main__':
    unittest.main()